author	Felix Blyakher <felixb@sgi.com>	2009-06-10 18:07:47 -0400
committer	Felix Blyakher <felixb@sgi.com>	2009-06-10 18:07:47 -0400
commit	4e73e0eb633f8a1b5cbf20e7f42c6dbfec1d1ca7 (patch)
tree	0cea46e43f0625244c3d06a71d6559e5ec5419ca /fs
parent	4156e735d3abde8e9243b5d22f7999dd3fffab2e (diff)
parent	07a2039b8eb0af4ff464efd3dfd95de5c02648c6 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_super.c	12
-rw-r--r--	fs/Kconfig	34
-rw-r--r--	fs/Makefile	6
-rw-r--r--	fs/adfs/super.c	16
-rw-r--r--	fs/affs/super.c	7
-rw-r--r--	fs/afs/Kconfig	8
-rw-r--r--	fs/afs/Makefile	3
-rw-r--r--	fs/afs/cache.c	503
-rw-r--r--	fs/afs/cache.h	15
-rw-r--r--	fs/afs/cell.c	16
-rw-r--r--	fs/afs/file.c	220
-rw-r--r--	fs/afs/inode.c	31
-rw-r--r--	fs/afs/internal.h	53
-rw-r--r--	fs/afs/main.c	27
-rw-r--r--	fs/afs/mntpt.c	4
-rw-r--r--	fs/afs/netdevices.c	3
-rw-r--r--	fs/afs/super.c	7
-rw-r--r--	fs/afs/vlocation.c	25
-rw-r--r--	fs/afs/volume.c	14
-rw-r--r--	fs/afs/write.c	21
-rw-r--r--	fs/autofs/dirhash.c	34
-rw-r--r--	fs/autofs4/autofs_i.h	2
-rw-r--r--	fs/autofs4/dev-ioctl.c	41
-rw-r--r--	fs/autofs4/expire.c	31
-rw-r--r--	fs/autofs4/root.c	41
-rw-r--r--	fs/autofs4/waitq.c	22
-rw-r--r--	fs/befs/debug.c	1
-rw-r--r--	fs/befs/linuxvfs.c	3
-rw-r--r--	fs/befs/super.c	1
-rw-r--r--	fs/binfmt_elf.c	22
-rw-r--r--	fs/binfmt_elf_fdpic.c	29
-rw-r--r--	fs/binfmt_flat.c	46
-rw-r--r--	fs/binfmt_som.c	7
-rw-r--r--	fs/bio.c	128
-rw-r--r--	fs/block_dev.c	1
-rw-r--r--	fs/btrfs/Makefile	21
-rw-r--r--	fs/btrfs/acl.c	20
-rw-r--r--	fs/btrfs/async-thread.c	67
-rw-r--r--	fs/btrfs/async-thread.h	2
-rw-r--r--	fs/btrfs/btrfs_inode.h	31
-rw-r--r--	fs/btrfs/ctree.c	948
-rw-r--r--	fs/btrfs/ctree.h	161
-rw-r--r--	fs/btrfs/delayed-ref.c	668
-rw-r--r--	fs/btrfs/delayed-ref.h	193
-rw-r--r--	fs/btrfs/dir-item.c	3
-rw-r--r--	fs/btrfs/disk-io.c	193
-rw-r--r--	fs/btrfs/disk-io.h	1
-rw-r--r--	fs/btrfs/extent-tree.c	2154
-rw-r--r--	fs/btrfs/extent_io.c	234
-rw-r--r--	fs/btrfs/extent_io.h	3
-rw-r--r--	fs/btrfs/extent_map.c	18
-rw-r--r--	fs/btrfs/file-item.c	7
-rw-r--r--	fs/btrfs/file.c	145
-rw-r--r--	fs/btrfs/free-space-cache.c	535
-rw-r--r--	fs/btrfs/free-space-cache.h	44
-rw-r--r--	fs/btrfs/inode-item.c	3
-rw-r--r--	fs/btrfs/inode-map.c	2
-rw-r--r--	fs/btrfs/inode.c	396
-rw-r--r--	fs/btrfs/ioctl.c	64
-rw-r--r--	fs/btrfs/locking.c	25
-rw-r--r--	fs/btrfs/ordered-data.c	120
-rw-r--r--	fs/btrfs/ordered-data.h	4
-rw-r--r--	fs/btrfs/super.c	98
-rw-r--r--	fs/btrfs/transaction.c	164
-rw-r--r--	fs/btrfs/transaction.h	8
-rw-r--r--	fs/btrfs/tree-defrag.c	2
-rw-r--r--	fs/btrfs/tree-log.c	458
-rw-r--r--	fs/btrfs/tree-log.h	17
-rw-r--r--	fs/btrfs/volumes.c	201
-rw-r--r--	fs/btrfs/volumes.h	18
-rw-r--r--	fs/buffer.c	163
-rw-r--r--	fs/cachefiles/Kconfig	39
-rw-r--r--	fs/cachefiles/Makefile	18
-rw-r--r--	fs/cachefiles/bind.c	286
-rw-r--r--	fs/cachefiles/daemon.c	755
-rw-r--r--	fs/cachefiles/interface.c	449
-rw-r--r--	fs/cachefiles/internal.h	360
-rw-r--r--	fs/cachefiles/key.c	159
-rw-r--r--	fs/cachefiles/main.c	106
-rw-r--r--	fs/cachefiles/namei.c	771
-rw-r--r--	fs/cachefiles/proc.c	134
-rw-r--r--	fs/cachefiles/rdwr.c	879
-rw-r--r--	fs/cachefiles/security.c	116
-rw-r--r--	fs/cachefiles/xattr.c	291
-rw-r--r--	fs/cifs/CHANGES	16
-rw-r--r--	fs/cifs/README	10
-rw-r--r--	fs/cifs/cifs_dfs_ref.c	32
-rw-r--r--	fs/cifs/cifs_spnego.c	2
-rw-r--r--	fs/cifs/cifs_unicode.c	198
-rw-r--r--	fs/cifs/cifs_unicode.h	23
-rw-r--r--	fs/cifs/cifsfs.c	54
-rw-r--r--	fs/cifs/cifsfs.h	2
-rw-r--r--	fs/cifs/cifsglob.h	19
-rw-r--r--	fs/cifs/cifspdu.h	8
-rw-r--r--	fs/cifs/cifsproto.h	5
-rw-r--r--	fs/cifs/cifssmb.c	221
-rw-r--r--	fs/cifs/connect.c	1357
-rw-r--r--	fs/cifs/dir.c	171
-rw-r--r--	fs/cifs/dns_resolve.c	2
-rw-r--r--	fs/cifs/file.c	161
-rw-r--r--	fs/cifs/inode.c	102
-rw-r--r--	fs/cifs/link.c	162
-rw-r--r--	fs/cifs/misc.c	71
-rw-r--r--	fs/cifs/netmisc.c	2
-rw-r--r--	fs/cifs/nterr.h	9
-rw-r--r--	fs/cifs/ntlmssp.h	68
-rw-r--r--	fs/cifs/readdir.c	78
-rw-r--r--	fs/cifs/sess.c	373
-rw-r--r--	fs/cifs/smberr.h	1
-rw-r--r--	fs/compat.c	141
-rw-r--r--	fs/compat_ioctl.c	9
-rw-r--r--	fs/configfs/symlink.c	2
-rw-r--r--	fs/cramfs/inode.c	39
-rw-r--r--	fs/cramfs/uncompress.c	2
-rw-r--r--	fs/dcache.c	5
-rw-r--r--	fs/debugfs/inode.c	16
-rw-r--r--	fs/devpts/inode.c	23
-rw-r--r--	fs/direct-io.c	4
-rw-r--r--	fs/drop_caches.c	2
-rw-r--r--	fs/ecryptfs/crypto.c	21
-rw-r--r--	fs/ecryptfs/ecryptfs_kernel.h	1
-rw-r--r--	fs/ecryptfs/inode.c	37
-rw-r--r--	fs/ecryptfs/keystore.c	3
-rw-r--r--	fs/ecryptfs/main.c	19
-rw-r--r--	fs/ecryptfs/messaging.c	85
-rw-r--r--	fs/ecryptfs/miscdev.c	43
-rw-r--r--	fs/ecryptfs/mmap.c	11
-rw-r--r--	fs/ecryptfs/read_write.c	32
-rw-r--r--	fs/ecryptfs/super.c	7
-rw-r--r--	fs/efs/super.c	20
-rw-r--r--	fs/eventfd.c	26
-rw-r--r--	fs/eventpoll.c	616
-rw-r--r--	fs/exec.c	126
-rw-r--r--	fs/exofs/BUGS	3
-rw-r--r--	fs/exofs/Kbuild	16
-rw-r--r--	fs/exofs/Kconfig	13
-rw-r--r--	fs/exofs/common.h	184
-rw-r--r--	fs/exofs/dir.c	672
-rw-r--r--	fs/exofs/exofs.h	180
-rw-r--r--	fs/exofs/file.c	87
-rw-r--r--	fs/exofs/inode.c	1303
-rw-r--r--	fs/exofs/namei.c	342
-rw-r--r--	fs/exofs/osd.c	153
-rw-r--r--	fs/exofs/super.c	584
-rw-r--r--	fs/exofs/symlink.c	57
-rw-r--r--	fs/ext2/acl.c	2
-rw-r--r--	fs/ext2/inode.c	44
-rw-r--r--	fs/ext2/super.c	4
-rw-r--r--	fs/ext3/Kconfig	19
-rw-r--r--	fs/ext3/acl.c	2
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/file.c	6
-rw-r--r--	fs/ext3/inode.c	165
-rw-r--r--	fs/ext3/ioctl.c	59
-rw-r--r--	fs/ext3/namei.c	35
-rw-r--r--	fs/ext3/super.c	8
-rw-r--r--	fs/ext4/Kconfig	2
-rw-r--r--	fs/ext4/acl.c	2
-rw-r--r--	fs/ext4/balloc.c	14
-rw-r--r--	fs/ext4/dir.c	16
-rw-r--r--	fs/ext4/ext4.h	93
-rw-r--r--	fs/ext4/ext4_extents.h	1
-rw-r--r--	fs/ext4/ext4_i.h	6
-rw-r--r--	fs/ext4/ext4_sb.h	14
-rw-r--r--	fs/ext4/extents.c	154
-rw-r--r--	fs/ext4/file.c	7
-rw-r--r--	fs/ext4/ialloc.c	279
-rw-r--r--	fs/ext4/inode.c	471
-rw-r--r--	fs/ext4/ioctl.c	17
-rw-r--r--	fs/ext4/mballoc.c	158
-rw-r--r--	fs/ext4/mballoc.h	8
-rw-r--r--	fs/ext4/namei.c	164
-rw-r--r--	fs/ext4/resize.c	8
-rw-r--r--	fs/ext4/super.c	336
-rw-r--r--	fs/fat/Kconfig	3
-rw-r--r--	fs/fat/inode.c	8
-rw-r--r--	fs/fcntl.c	6
-rw-r--r--	fs/file_table.c	1
-rw-r--r--	fs/filesystems.c	2
-rw-r--r--	fs/fs-writeback.c	31
-rw-r--r--	fs/fs_struct.c	177
-rw-r--r--	fs/fscache/Kconfig	56
-rw-r--r--	fs/fscache/Makefile	19
-rw-r--r--	fs/fscache/cache.c	415
-rw-r--r--	fs/fscache/cookie.c	500
-rw-r--r--	fs/fscache/fsdef.c	144
-rw-r--r--	fs/fscache/histogram.c	109
-rw-r--r--	fs/fscache/internal.h	380
-rw-r--r--	fs/fscache/main.c	124
-rw-r--r--	fs/fscache/netfs.c	103
-rw-r--r--	fs/fscache/object.c	810
-rw-r--r--	fs/fscache/operation.c	459
-rw-r--r--	fs/fscache/page.c	816
-rw-r--r--	fs/fscache/proc.c	68
-rw-r--r--	fs/fscache/stats.c	212
-rw-r--r--	fs/fuse/dir.c	1
-rw-r--r--	fs/fuse/file.c	61
-rw-r--r--	fs/fuse/inode.c	4
-rw-r--r--	fs/generic_acl.c	2
-rw-r--r--	fs/gfs2/acl.c	2
-rw-r--r--	fs/gfs2/glock.c	11
-rw-r--r--	fs/gfs2/glops.c	6
-rw-r--r--	fs/gfs2/inode.c	8
-rw-r--r--	fs/gfs2/inode.h	14
-rw-r--r--	fs/gfs2/ops_file.c	15
-rw-r--r--	fs/gfs2/ops_fstype.c	13
-rw-r--r--	fs/gfs2/ops_inode.c	1
-rw-r--r--	fs/gfs2/quota.c	4
-rw-r--r--	fs/gfs2/rgrp.c	13
-rw-r--r--	fs/hfs/inode.c	4
-rw-r--r--	fs/hfs/mdb.c	1
-rw-r--r--	fs/hfs/super.c	3
-rw-r--r--	fs/hfsplus/options.c	2
-rw-r--r--	fs/hfsplus/super.c	3
-rw-r--r--	fs/hpfs/super.c	8
-rw-r--r--	fs/hppfs/hppfs.c	7
-rw-r--r--	fs/hugetlbfs/inode.c	35
-rw-r--r--	fs/inode.c	149
-rw-r--r--	fs/internal.h	8
-rw-r--r--	fs/ioctl.c	75
-rw-r--r--	fs/isofs/inode.c	3
-rw-r--r--	fs/jbd/commit.c	36
-rw-r--r--	fs/jbd/journal.c	34
-rw-r--r--	fs/jbd/revoke.c	44
-rw-r--r--	fs/jbd/transaction.c	2
-rw-r--r--	fs/jbd2/commit.c	19
-rw-r--r--	fs/jbd2/revoke.c	45
-rw-r--r--	fs/jbd2/transaction.c	2
-rw-r--r--	fs/jffs2/acl.c	6
-rw-r--r--	fs/jffs2/erase.c	7
-rw-r--r--	fs/jffs2/malloc.c	6
-rw-r--r--	fs/jfs/acl.c	2
-rw-r--r--	fs/libfs.c	19
-rw-r--r--	fs/lockd/clntlock.c	51
-rw-r--r--	fs/lockd/mon.c	8
-rw-r--r--	fs/lockd/svc.c	57
-rw-r--r--	fs/lockd/svclock.c	13
-rw-r--r--	fs/minix/inode.c	11
-rw-r--r--	fs/mpage.c	13
-rw-r--r--	fs/namei.c	29
-rw-r--r--	fs/namespace.c	91
-rw-r--r--	fs/ncpfs/ioctl.c	21
-rw-r--r--	fs/nfs/Kconfig	8
-rw-r--r--	fs/nfs/Makefile	1
-rw-r--r--	fs/nfs/callback.c	31
-rw-r--r--	fs/nfs/callback.h	1
-rw-r--r--	fs/nfs/client.c	130
-rw-r--r--	fs/nfs/dir.c	12
-rw-r--r--	fs/nfs/file.c	79
-rw-r--r--	fs/nfs/fscache-index.c	337
-rw-r--r--	fs/nfs/fscache.c	523
-rw-r--r--	fs/nfs/fscache.h	220
-rw-r--r--	fs/nfs/getroot.c	4
-rw-r--r--	fs/nfs/inode.c	323
-rw-r--r--	fs/nfs/internal.h	8
-rw-r--r--	fs/nfs/iostat.h	18
-rw-r--r--	fs/nfs/nfs2xdr.c	9
-rw-r--r--	fs/nfs/nfs3proc.c	7
-rw-r--r--	fs/nfs/nfs3xdr.c	40
-rw-r--r--	fs/nfs/nfs4proc.c	58
-rw-r--r--	fs/nfs/nfs4state.c	10
-rw-r--r--	fs/nfs/nfs4xdr.c	213
-rw-r--r--	fs/nfs/nfsroot.c	2
-rw-r--r--	fs/nfs/pagelist.c	11
-rw-r--r--	fs/nfs/proc.c	1
-rw-r--r--	fs/nfs/read.c	27
-rw-r--r--	fs/nfs/super.c	71
-rw-r--r--	fs/nfs/write.c	53
-rw-r--r--	fs/nfsd/Kconfig	1
-rw-r--r--	fs/nfsd/nfs3proc.c	10
-rw-r--r--	fs/nfsd/nfs4callback.c	47
-rw-r--r--	fs/nfsd/nfs4proc.c	246
-rw-r--r--	fs/nfsd/nfs4recover.c	120
-rw-r--r--	fs/nfsd/nfs4state.c	1195
-rw-r--r--	fs/nfsd/nfs4xdr.c	649
-rw-r--r--	fs/nfsd/nfsctl.c	44
-rw-r--r--	fs/nfsd/nfsproc.c	3
-rw-r--r--	fs/nfsd/nfssvc.c	100
-rw-r--r--	fs/nfsd/vfs.c	69
-rw-r--r--	fs/nilfs2/Makefile	5
-rw-r--r--	fs/nilfs2/alloc.c	504
-rw-r--r--	fs/nilfs2/alloc.h	72
-rw-r--r--	fs/nilfs2/bmap.c	788
-rw-r--r--	fs/nilfs2/bmap.h	244
-rw-r--r--	fs/nilfs2/bmap_union.h	42
-rw-r--r--	fs/nilfs2/btnode.c	316
-rw-r--r--	fs/nilfs2/btnode.h	58
-rw-r--r--	fs/nilfs2/btree.c	2269
-rw-r--r--	fs/nilfs2/btree.h	117
-rw-r--r--	fs/nilfs2/cpfile.c	927
-rw-r--r--	fs/nilfs2/cpfile.h	45
-rw-r--r--	fs/nilfs2/dat.c	430
-rw-r--r--	fs/nilfs2/dat.h	52
-rw-r--r--	fs/nilfs2/dir.c	711
-rw-r--r--	fs/nilfs2/direct.c	436
-rw-r--r--	fs/nilfs2/direct.h	78
-rw-r--r--	fs/nilfs2/file.c	160
-rw-r--r--	fs/nilfs2/gcdat.c	84
-rw-r--r--	fs/nilfs2/gcinode.c	288
-rw-r--r--	fs/nilfs2/ifile.c	150
-rw-r--r--	fs/nilfs2/ifile.h	53
-rw-r--r--	fs/nilfs2/inode.c	785
-rw-r--r--	fs/nilfs2/ioctl.c	665
-rw-r--r--	fs/nilfs2/mdt.c	564
-rw-r--r--	fs/nilfs2/mdt.h	125
-rw-r--r--	fs/nilfs2/namei.c	474
-rw-r--r--	fs/nilfs2/nilfs.h	314
-rw-r--r--	fs/nilfs2/page.c	541
-rw-r--r--	fs/nilfs2/page.h	76
-rw-r--r--	fs/nilfs2/recovery.c	919
-rw-r--r--	fs/nilfs2/sb.h	102
-rw-r--r--	fs/nilfs2/segbuf.c	439
-rw-r--r--	fs/nilfs2/segbuf.h	201
-rw-r--r--	fs/nilfs2/seglist.h	85
-rw-r--r--	fs/nilfs2/segment.c	2978
-rw-r--r--	fs/nilfs2/segment.h	244
-rw-r--r--	fs/nilfs2/sufile.c	558
-rw-r--r--	fs/nilfs2/sufile.h	125
-rw-r--r--	fs/nilfs2/super.c	1326
-rw-r--r--	fs/nilfs2/the_nilfs.c	641
-rw-r--r--	fs/nilfs2/the_nilfs.h	298
-rw-r--r--	fs/notify/inotify/inotify_user.c	2
-rw-r--r--	fs/ntfs/dir.c	4
-rw-r--r--	fs/ntfs/inode.c	3
-rw-r--r--	fs/ntfs/layout.h	329
-rw-r--r--	fs/ntfs/logfile.h	6
-rw-r--r--	fs/ntfs/mft.c	2
-rw-r--r--	fs/ntfs/super.c	50
-rw-r--r--	fs/ntfs/usnjrnl.h	48
-rw-r--r--	fs/ocfs2/acl.c	2
-rw-r--r--	fs/ocfs2/alloc.c	57
-rw-r--r--	fs/ocfs2/alloc.h	3
-rw-r--r--	fs/ocfs2/aops.c	23
-rw-r--r--	fs/ocfs2/cluster/heartbeat.c	96
-rw-r--r--	fs/ocfs2/cluster/heartbeat.h	3
-rw-r--r--	fs/ocfs2/cluster/nodemanager.c	9
-rw-r--r--	fs/ocfs2/dcache.c	15
-rw-r--r--	fs/ocfs2/dir.c	2808
-rw-r--r--	fs/ocfs2/dir.h	57
-rw-r--r--	fs/ocfs2/dlm/dlmcommon.h	58
-rw-r--r--	fs/ocfs2/dlm/dlmdebug.c	87
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c	29
-rw-r--r--	fs/ocfs2/dlm/dlmmaster.c	387
-rw-r--r--	fs/ocfs2/dlm/dlmthread.c	20
-rw-r--r--	fs/ocfs2/dlmglue.c	46
-rw-r--r--	fs/ocfs2/dlmglue.h	2
-rw-r--r--	fs/ocfs2/export.c	85
-rw-r--r--	fs/ocfs2/file.c	94
-rw-r--r--	fs/ocfs2/inode.c	48
-rw-r--r--	fs/ocfs2/inode.h	5
-rw-r--r--	fs/ocfs2/journal.c	173
-rw-r--r--	fs/ocfs2/journal.h	78
-rw-r--r--	fs/ocfs2/localalloc.c	86
-rw-r--r--	fs/ocfs2/mmap.c	6
-rw-r--r--	fs/ocfs2/namei.c	254
-rw-r--r--	fs/ocfs2/ocfs2.h	76
-rw-r--r--	fs/ocfs2/ocfs2_fs.h	136
-rw-r--r--	fs/ocfs2/ocfs2_lockid.h	4
-rw-r--r--	fs/ocfs2/suballoc.c	259
-rw-r--r--	fs/ocfs2/suballoc.h	4
-rw-r--r--	fs/ocfs2/super.c	188
-rw-r--r--	fs/ocfs2/symlink.c	77
-rw-r--r--	fs/ocfs2/xattr.c	8
-rw-r--r--	fs/ocfs2/xattr.h	2
-rw-r--r--	fs/omfs/inode.c	7
-rw-r--r--	fs/open.c	3
-rw-r--r--	fs/partitions/check.c	4
-rw-r--r--	fs/pipe.c	42
-rw-r--r--	fs/proc/array.c	13
-rw-r--r--	fs/proc/base.c	12
-rw-r--r--	fs/proc/meminfo.c	4
-rw-r--r--	fs/proc/nommu.c	2
-rw-r--r--	fs/proc/proc_tty.c	12
-rw-r--r--	fs/proc/root.c	3
-rw-r--r--	fs/proc/stat.c	5
-rw-r--r--	fs/proc/task_mmu.c	8
-rw-r--r--	fs/proc/task_nommu.c	9
-rw-r--r--	fs/qnx4/inode.c	3
-rw-r--r--	fs/quota/Makefile	9
-rw-r--r--	fs/quota/dquot.c	2
-rw-r--r--	fs/ramfs/file-nommu.c	15
-rw-r--r--	fs/ramfs/inode.c	113
-rw-r--r--	fs/read_write.c	56
-rw-r--r--	fs/reiserfs/Kconfig	1
-rw-r--r--	fs/reiserfs/dir.c	24
-rw-r--r--	fs/reiserfs/namei.c	17
-rw-r--r--	fs/reiserfs/super.c	16
-rw-r--r--	fs/reiserfs/xattr.c	260
-rw-r--r--	fs/reiserfs/xattr_acl.c	2
-rw-r--r--	fs/reiserfs/xattr_security.c	12
-rw-r--r--	fs/romfs/Kconfig	48
-rw-r--r--	fs/romfs/Makefile	9
-rw-r--r--	fs/romfs/inode.c	665
-rw-r--r--	fs/romfs/internal.h	47
-rw-r--r--	fs/romfs/mmap-nommu.c	75
-rw-r--r--	fs/romfs/storage.c	293
-rw-r--r--	fs/romfs/super.c	654
-rw-r--r--	fs/splice.c	358
-rw-r--r--	fs/squashfs/Makefile	1
-rw-r--r--	fs/squashfs/cache.c	1
-rw-r--r--	fs/squashfs/export.c	1
-rw-r--r--	fs/squashfs/super.c	13
-rw-r--r--	fs/stat.c	137
-rw-r--r--	fs/super.c	85
-rw-r--r--	fs/sysfs/bin.c	21
-rw-r--r--	fs/sysfs/file.c	16
-rw-r--r--	fs/sysv/inode.c	3
-rw-r--r--	fs/ubifs/Kconfig	4
-rw-r--r--	fs/ubifs/budget.c	37
-rw-r--r--	fs/ubifs/debug.c	6
-rw-r--r--	fs/ubifs/file.c	25
-rw-r--r--	fs/ubifs/find.c	12
-rw-r--r--	fs/ubifs/gc.c	428
-rw-r--r--	fs/ubifs/journal.c	7
-rw-r--r--	fs/ubifs/key.h	6
-rw-r--r--	fs/ubifs/log.c	5
-rw-r--r--	fs/ubifs/lpt_commit.c	34
-rw-r--r--	fs/ubifs/recovery.c	70
-rw-r--r--	fs/ubifs/replay.c	2
-rw-r--r--	fs/ubifs/sb.c	36
-rw-r--r--	fs/ubifs/shrinker.c	6
-rw-r--r--	fs/ubifs/super.c	40
-rw-r--r--	fs/ubifs/tnc.c	2
-rw-r--r--	fs/ubifs/ubifs-media.h	30
-rw-r--r--	fs/ubifs/ubifs.h	13
-rw-r--r--	fs/udf/balloc.c	150
-rw-r--r--	fs/udf/dir.c	14
-rw-r--r--	fs/udf/directory.c	38
-rw-r--r--	fs/udf/ecma_167.h	416
-rw-r--r--	fs/udf/ialloc.c	9
-rw-r--r--	fs/udf/inode.c	213
-rw-r--r--	fs/udf/misc.c	29
-rw-r--r--	fs/udf/namei.c	86
-rw-r--r--	fs/udf/osta_udf.h	22
-rw-r--r--	fs/udf/partition.c	2
-rw-r--r--	fs/udf/super.c	605
-rw-r--r--	fs/udf/truncate.c	44
-rw-r--r--	fs/udf/udf_i.h	6
-rw-r--r--	fs/udf/udf_sb.h	9
-rw-r--r--	fs/udf/udfdecl.h	57
-rw-r--r--	fs/udf/udfend.h	28
-rw-r--r--	fs/udf/udftime.c	6
-rw-r--r--	fs/udf/unicode.c	62
-rw-r--r--	fs/ufs/dir.c	2
-rw-r--r--	fs/ufs/file.c	2
-rw-r--r--	fs/ufs/super.c	3
-rw-r--r--	fs/ufs/ufs.h	2
-rw-r--r--	fs/xattr.c	10
-rw-r--r--	fs/xfs/linux-2.6/xfs_file.c	4
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl.c	23
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl32.c	12
-rw-r--r--	fs/xfs/linux-2.6/xfs_iops.c	4
452 files changed, 52825 insertions, 11103 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5f8ab8adb5f5..ab5547ff29a1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -155,6 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	root = d_alloc_root(inode);
 	if (!root) {
+		iput(inode);
 		retval = -ENOMEM;
 		goto release_sb;
 	}
@@ -173,10 +175,7 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	return 0;
 
 release_sb:
-	if (sb) {
-		up_write(&sb->s_umount);
-		deactivate_super(sb);
-	}
+	deactivate_locked_super(sb);
 
 free_stat:
 	kfree(st);
@@ -230,9 +229,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses;
 
+	lock_kernel();
+	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
+	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
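
Two fixes are visible in the v9fs hunks above: the mount error path now drops the inode with iput() when d_alloc_root() fails, and the superblock teardown collapses an open-coded up_write()/deactivate_super() pair into a single deactivate_locked_super() call. A minimal user-space sketch of that consolidation, assuming only what the hunk shows (helper and field names here are illustrative, not the kernel's real implementation):

#include <stdio.h>

struct super_block { int s_umount_held; int s_active; };

static void up_write_s_umount(struct super_block *sb) { sb->s_umount_held = 0; }
static void deactivate_super(struct super_block *sb)  { sb->s_active--; }

/* one call now performs both steps the old error path open-coded */
static void deactivate_locked_super(struct super_block *sb)
{
	up_write_s_umount(sb);  /* release s_umount held since superblock setup */
	deactivate_super(sb);   /* drop the active reference */
}

int main(void)
{
	struct super_block sb = { .s_umount_held = 1, .s_active = 1 };
	deactivate_locked_super(&sb);
	printf("s_umount_held=%d s_active=%d\n", sb.s_umount_held, sb.s_active);
	return 0;
}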
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -168,6 +175,33 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o
+	stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
@@ -113,10 +114,13 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_9P_FS) += 9p/
 obj-$(CONFIG_AFS_FS) += afs/
+obj-$(CONFIG_NILFS2_FS) += nilfs2/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
+obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = dentry->d_sb->s_blocksize;
-	buf->f_blocks  = asb->s_size;
-	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize   = sb->s_blocksize;
+	buf->f_blocks  = sbi->s_size;
+	buf->f_files   = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(dentry->d_sb);
+	buf->f_bfree   = adfs_map_free(sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
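
The adfs_statfs() hunk above introduces the f_fsid encoding pattern that several filesystems adopt in this merge: huge_encode_dev() packs the block device's major/minor numbers into a u64, whose low and high 32-bit halves become f_fsid.val[0] and f_fsid.val[1]. A standalone sketch of just that split (the example device id is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000000800000003ULL; /* stand-in for huge_encode_dev() */
	uint32_t val[2];

	val[0] = (uint32_t)id;          /* low 32 bits  */
	val[1] = (uint32_t)(id >> 32);  /* high 32 bits */

	printf("f_fsid = { %#x, %#x }\n", val[0], val[1]);
	return 0;
}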
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..63f5183f263b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -507,8 +507,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	kfree(sb->s_options);
-	sb->s_options = new_opts;
+	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
 	sbi->s_mode  = mode;
@@ -533,6 +532,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int free;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +543,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks  = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree   = free;
 	buf->f_bavail  = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name = "cell_ix",
-	.data_size = sizeof(struct afs_cache_cell),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_cell_cache_match,
-	.update = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name = "afs",
+	.version = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name = "AFS.cell",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_cell_cache_get_key,
+	.get_aux = afs_cell_cache_get_aux,
+	.check_aux = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name = "AFS.vldb",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_vlocation_cache_get_key,
+	.get_aux = afs_vlocation_cache_get_aux,
+	.check_aux = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name = "AFS.volume",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name = "AFS.vnode",
+	.type = FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key = afs_vnode_cache_get_key,
+	.get_attr = afs_vnode_cache_get_attr,
+	.get_aux = afs_vnode_cache_get_aux,
+	.check_aux = afs_vnode_cache_check_aux,
+	.now_uncached = afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name = "vldb",
-	.data_size = sizeof(struct afs_cache_vlocation),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_vlocation_cache_match,
-	.update = afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
 {
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+		   ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+		   ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
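
The cell.c hunks above show both ends of an FS-Cache cookie's life: acquisition against an index definition at cell creation and relinquishment at destruction. A hedged sketch of the full lifecycle, modelled on the calls in this diff; the fscache_register_netfs()/fscache_unregister_netfs() pair is assumed from <linux/fscache.h> and is not shown in these hunks:

#include <linux/fscache.h>

static int example_cache_init(struct afs_cell *cell)
{
	int ret;

	/* make the "afs" namespace known to FS-Cache (assumed API) */
	ret = fscache_register_netfs(&afs_cache_netfs);
	if (ret < 0)
		return ret;

	/* index cookies hang off the netfs primary index; per the
	 * comment in the hunk above, acquisition never returns an error */
	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
					     &afs_cell_cache_index_def,
					     cell);
	return 0;
}

static void example_cache_exit(struct afs_cell *cell)
{
	/* 0 => keep the cached data; 1 would retire (discard) it */
	fscache_relinquish_cookie(cell->cache, 0);
	fscache_unregister_netfs(&afs_cache_netfs);
}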
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..0149dab365e7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_readpage,
+	.readpages	= afs_readpages,
 	.set_page_dirty	= afs_set_page_dirty,
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
@@ -98,38 +102,21 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
-}
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
 }
 #endif
 
@@ -161,9 +148,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +158,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-	/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 	/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-	/* no page available in cache */
-	case -ENOBUFS:
+	/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+	/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +186,25 @@ static int afs_readpage(struct file *file, struct page *page)
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			ret = -ESTALE;
 		}
-#ifdef AFS_CACHING_SUPPORT
-		cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+		fscache_uncache_page(vnode->cache, page);
 #endif
+		BUG_ON(PageFsCache(page));
 		goto error;
 	}
 
 	SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-	if (cachefs_write_page(vnode->cache,
-			       page,
-			       afs_file_readpage_write_complete,
-			       NULL,
-			       GFP_KERNEL) != 0
-	    ) {
-		cachefs_uncache_page(vnode->cache, page);
-		unlock_page(page);
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
 	}
-#else
-	unlock_page(page);
 #endif
+	unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +218,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+	/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+	/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+	/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +284,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink = vnode->status.nlink;
 	inode->i_uid = vnode->status.owner;
 	inode->i_gid = 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
 	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head	grave;		/* link in master graveyard list */
 	struct list_head	update;		/* link in master update list */
 	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb;	/* volume information DB record */
 	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
  */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
 * main.c
 */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
 * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
 * vlclient.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
 				    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
 * vnode.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
 	return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
 * volume.c
 */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
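The #else branch in the internal.h hunk above defines each index definition as a dereference of a NULL pointer cast. That is safe only because every reference to these names is itself guarded by #ifdef CONFIG_AFS_FSCACHE, so the expression is never compiled into live code. A hedged, self-contained illustration of the same stub pattern (HAVE_CACHE, cache_register() and cache_def are invented for this sketch):

#include <stdio.h>

#ifdef HAVE_CACHE			/* build with -DHAVE_CACHE to enable */
struct cache_def { const char *name; };
static struct cache_def file_cache_def = { "files" };
#define file_cache_index_def file_cache_def
static void cache_register(struct cache_def *def)
{
	printf("registered cache '%s'\n", def->name);
}
#else
struct cache_def;			/* opaque: never actually dereferenced */
#define file_cache_index_def (*(struct cache_def *) NULL)
/* The stub discards its argument, so the NULL cast is never evaluated. */
#define cache_register(def) do {} while (0)
#endif

int main(void)
{
	cache_register(&file_cache_index_def);
	return 0;
}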
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	if (PageError(page))
 		goto error;
 
-	buf = kmap(page);
+	buf = kmap_atomic(page, KM_USER0);
 	memcpy(devname, buf, size);
-	kunmap(page);
+	kunmap_atomic(buf, KM_USER0);
 	page_cache_release(page);
 	page = NULL;
 
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f189423063..7ad36506c256 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 	struct net_device *dev;
 	int ret = -ENODEV;
 
-	if (maclen != ETH_ALEN)
-		BUG();
+	BUG_ON(maclen != ETH_ALEN);
 
 	rtnl_lock();
 	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index aee239a048cb..76828e5f8a39 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -405,21 +405,20 @@ static int afs_get_sb(struct file_system_type *fs_type,
 		sb->s_flags = flags;
 		ret = afs_fill_super(sb, &params);
 		if (ret < 0) {
-			up_write(&sb->s_umount);
-			deactivate_super(sb);
+			deactivate_locked_super(sb);
 			goto error;
 		}
-		sb->s_options = new_opts;
+		save_mount_options(sb, new_opts);
 		sb->s_flags |= MS_ACTIVE;
 	} else {
 		_debug("reuse");
-		kfree(new_opts);
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
 	}
 
 	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
 	afs_put_cell(params.cell);
+	kfree(new_opts);
 	_leave(" = 0 [%p]", sb);
 	return 0;
 
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af98004..4eb4d8dfb2f1 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
 	struct autofs_dirhash *dh = &sbi->dirhash;
 	struct autofs_dir_ent *ent;
-	struct dentry *dentry;
 	unsigned long timeout = sbi->exp_timeout;
 
 	while (1) {
+		struct path path;
+		int umount_ok;
+
 		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
 			return NULL;	/* No entries */
 		/* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			return ent; /* Symlinks are always expirable */
 
 		/* Get the dentry for the autofs subdirectory */
-		dentry = ent->dentry;
+		path.dentry = ent->dentry;
 
-		if ( !dentry ) {
+		if (!path.dentry) {
 			/* Should only happen in catatonic mode */
 			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
 			autofs_delete_usage(ent);
 			continue;
 		}
 
-		if ( !dentry->d_inode ) {
-			dput(dentry);
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
 			printk("autofs: negative dentry on expiry queue: %s\n",
 				ent->name);
 			autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 
 		/* Make sure entry is mounted and unused; note that dentry will
 		   point to the mounted-on-top root. */
-		if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		mntget(mnt);
-		dget(dentry);
-		if (!follow_down(&mnt, &dentry)) {
-			dput(dentry);
-			mntput(mnt);
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path.mnt, &path.dentry)) {
+			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+		while (d_mountpoint(path.dentry) &&
+		       follow_down(&path.mnt, &path.dentry))
 			;
-		dput(dentry);
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
 
-		if ( may_umount(mnt) ) {
-			mntput(mnt);
+		if (umount_ok) {
 			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
 			return ent; /* Expirable! */
 		}
 		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-		mntput(mnt);
 	}
 	return NULL;		/* No expirable entries */
 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d06..b7ff33c63101 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *,
 			struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffea..84168c0dcc2d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
 * Check a string doesn't overrun the chunk of
 * memory we copied from user land.
 */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-	while ((void *) str <= end)
-		if (!*str++)
-			return 0;
+	if (memchr(str, 0, size))
+		return 0;
 	return -EINVAL;
 }
 
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
 			AUTOFS_WARN(
 			  "path string terminator missing for cmd(0x%08x)",
@@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	}
 
 	path = param->path;
-	devid = sbi->sb->s_dev;
+	devid = new_encode_dev(sbi->sb->s_dev);
 
 	param->requester.uid = param->requester.gid = -1;
 
@@ -525,40 +523,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 				   struct autofs_sb_info *sbi,
 				   struct autofs_dev_ioctl *param)
 {
-	struct dentry *dentry;
 	struct vfsmount *mnt;
-	int err = -EAGAIN;
 	int how;
 
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-	else
-		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
-	if (dentry) {
-		struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
-		/*
-		 * This is synchronous because it makes the daemon a
-		 * little easier
-		 */
-		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-			sbi->sb->s_root->d_mounted++;
-		}
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		complete_all(&ino->expire_complete);
-		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-	}
-
-	return err;
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
 }
 
 /* Check if autofs mount point is in use */
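The invalid_str() rewrite above is the key hardening change in this file: instead of walking bytes up to a computed end pointer (with `<=`, one byte too far), it asks memchr() whether a NUL terminator exists within the exact number of bytes copied from userspace. The same check in portable, standalone C (the sample buffers are made up for the demonstration):

#include <stdio.h>
#include <string.h>

/* Return 0 if a NUL terminator occurs within the first 'size' bytes. */
static int invalid_str(const char *str, size_t size)
{
	return memchr(str, 0, size) ? 0 : -1;
}

int main(void)
{
	char ok[8] = "path";			/* NUL-terminated inside the buffer */
	char bad[4] = { 'p', 'a', 't', 'h' };	/* no terminator at all */

	printf("%d %d\n", invalid_str(ok, sizeof(ok)),
	       invalid_str(bad, sizeof(bad)));	/* prints "0 -1" */
	return 0;
}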
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9e..3077d8f16523 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		 * Otherwise it's an offset mount and we need to check
 		 * if we can umount its mount, if there is one.
 		 */
-		if (!d_mountpoint(dentry))
+		if (!d_mountpoint(dentry)) {
+			status = 0;
 			goto done;
+		}
 	}
 
 	/* Update the expiry counter if fs is busy */
@@ -478,22 +480,16 @@ int autofs4_expire_run(struct super_block *sb,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
-	int do_now = 0;
-
-	if (arg && get_user(do_now, arg))
-		return -EFAULT;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
 	else
-		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +512,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			 struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
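The split above — autofs4_do_expire_multi() doing the work, autofs4_expire_multi() reduced to a get_user() wrapper — lets the dev-ioctl path call straight into the core with an ordinary int. A standalone sketch of the same boundary/worker split (fake_get_user() stands in for the real user-copy primitive; it is not kernel API):

#include <errno.h>
#include <stdio.h>

/* Stand-in for get_user(): copy one int from a (possibly bad) pointer. */
static int fake_get_user(int *dst, const int *src)
{
	if (!src)
		return -EFAULT;
	*dst = *src;
	return 0;
}

/* Core worker: plain arguments, callable directly by in-process users. */
static int do_expire_multi(int when)
{
	printf("expiring, when=%d\n", when);
	return 0;
}

/* Boundary wrapper: fetch the argument from "userspace", then delegate. */
static int expire_multi(const int *user_arg)
{
	int when = 0;

	if (user_arg && fake_get_user(&when, user_arg))
		return -EFAULT;
	return do_expire_multi(when);
}

int main(void)
{
	int now = 1;
	return expire_multi(&now);
}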
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a9504..e383bf0334f1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-	if (expiring) {
-		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 */
-		ino = autofs4_dentry_ino(expiring);
-		autofs4_expire_wait(expiring);
-		spin_lock(&sbi->lookup_lock);
-		if (!list_empty(&ino->expiring))
-			list_del_init(&ino->expiring);
-		spin_unlock(&sbi->lookup_lock);
-		dput(expiring);
-	}
-
 	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
 	if (unhashed)
 		dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (!oz_mode) {
+		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(sbi,
+						   dentry->d_parent,
+						   &dentry->d_name);
+		if (expiring) {
+			/*
+			 * If we are racing with expire the request might not
+			 * be quite complete but the directory has been removed
+			 * so it must have been successful, so just wait for it.
+			 */
+			ino = autofs4_dentry_ino(expiring);
+			autofs4_expire_wait(expiring);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&ino->expiring))
+				list_del_init(&ino->expiring);
+			spin_unlock(&sbi->lookup_lock);
+			dput(expiring);
+		}
+
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
-			mutex_unlock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
 			(dentry->d_op->d_revalidate)(dentry, nd);
 			mutex_lock(&dir->i_mutex);
-		}
 	}
 
 	/*
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index eeb246845909..2341375386f8 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait,
 	 */
 	if (notify == NFY_MOUNT) {
 		/*
-		 * If the dentry isn't hashed just go ahead and try the
-		 * mount again with a new wait (not much else we can do).
-		 */
-		if (!d_unhashed(dentry)) {
-			/*
-			 * But if the dentry is hashed, that means that we
-			 * got here through the revalidate path.  Thus, we
-			 * need to check if the dentry has been mounted
-			 * while we waited on the wq_mutex.  If it has,
-			 * simply return success.
-			 */
-			if (d_mountpoint(dentry))
-				return 0;
-		}
+		 * If the dentry was successfully mounted while we slept
+		 * on the wait queue mutex we can return success. If it
+		 * isn't mounted (doesn't have submounts for the case of
+		 * a multi-mount with no mount at it's base) we can
+		 * continue on and create a new request.
+		 */
+		if (have_submounts(dentry))
+			return 0;
 	}
 
 	return 1;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
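The befs_statfs() change above packs a 64-bit device identifier into the two 32-bit halves of f_fsid. The arithmetic, standalone (the sample id value is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x1122334455667788ULL;	/* e.g. an encoded dev number */
	uint32_t val[2];

	val[0] = (uint32_t)id;			/* low 32 bits */
	val[1] = (uint32_t)(id >> 32);		/* high 32 bits */

	printf("%08x %08x\n", val[0], val[1]);	/* 55667788 11223344 */
	return 0;
}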
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 41f2b4d0093e..ca40f828f64d 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -8,6 +8,7 @@
 */
 
 #include <linux/fs.h>
+#include <asm/page.h>	/* for PAGE_SIZE */
 
 #include "befs.h"
 #include "super.h"
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX ||
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 	fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..fdb66faa24f1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 		params->elfhdr_addr = seg->addr;
 
 	/* clear any space allocated but not loaded */
-	if (phdr->p_filesz < phdr->p_memsz)
-		clear_user((void *) (seg->addr + phdr->p_filesz),
-			   phdr->p_memsz - phdr->p_filesz);
+	if (phdr->p_filesz < phdr->p_memsz) {
+		ret = clear_user((void *) (seg->addr + phdr->p_filesz),
+				 phdr->p_memsz - phdr->p_filesz);
+		if (ret)
+			return ret;
+	}
 
 	if (mm) {
 		if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void __user *) maddr, disp);
+			ret = clear_user((void __user *) maddr, disp);
+			if (ret)
+				return ret;
 			maddr += disp;
 		}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
-				   excess1);
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
+					 excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
@@ -1379,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->parent);
+	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1424,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_psargs[len] = 0;
 
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->parent);
+	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
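clear_user() reports the number of bytes it failed to zero, and the fdpic fixes above stop discarding that result when clearing the unloaded tail of a segment. A standalone sketch of the same propagate-on-partial-failure shape (checked_clear() is a stand-in for this sketch, not the kernel helper):

#include <stdio.h>
#include <string.h>

/* Stand-in for clear_user(): returns the number of bytes NOT cleared. */
static size_t checked_clear(void *dst, size_t len)
{
	if (!dst)
		return len;	/* simulated fault: nothing was cleared */
	memset(dst, 0, len);
	return 0;
}

static int load_segment(char *mem, size_t filesz, size_t memsz)
{
	size_t ret;

	if (filesz < memsz) {
		ret = checked_clear(mem ? mem + filesz : NULL, memsz - filesz);
		if (ret)
			return -1;	/* propagate instead of ignoring */
	}
	return 0;
}

int main(void)
{
	char seg[64];

	printf("%d %d\n", load_segment(seg, 16, sizeof(seg)),
	       load_segment(NULL, 16, 64));	/* prints "0 -1" */
	return 0;
}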
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5cebf0b37798..697f6b5f1313 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -41,6 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <asm/cacheflush.h>
+#include <asm/page.h>
 
 /****************************************************************************/
 
@@ -54,6 +55,18 @@
 #define	DBG_FLT(a...)
 #endif
 
+/*
+ * User data (stack, data section and bss) needs to be aligned
+ * for the same reasons as SLAB memory is, and to the same amount.
+ * Avoid duplicating architecture specific code by using the same
+ * macro as with SLAB allocation:
+ */
+#ifdef ARCH_SLAB_MINALIGN
+#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#else
+#define FLAT_DATA_ALIGN	(sizeof(void *))
+#endif
+
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
 
@@ -114,20 +127,18 @@ static unsigned long create_flat_tables(
 	int envc = bprm->envc;
 	char uninitialized_var(dummy);
 
-	sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p);
+	sp = (unsigned long *)p;
+	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
+	envp = argv + (argc + 1);
 
-	sp -= envc+1;
-	envp = sp;
-	sp -= argc+1;
-	argv = sp;
-
-	flat_stack_align(sp);
 	if (flat_argvp_envp_on_stack()) {
-		--sp; put_user((unsigned long) envp, sp);
-		--sp; put_user((unsigned long) argv, sp);
+		put_user((unsigned long) envp, sp + 2);
+		put_user((unsigned long) argv, sp + 1);
 	}
 
-	put_user(argc,--sp);
+	put_user(argc, sp);
 	current->mm->arg_start = (unsigned long) p;
 	while (argc-->0) {
 		put_user((unsigned long) p, argv++);
@@ -558,7 +569,9 @@ static int load_flat_file(struct linux_binprm * bprm,
 			ret = realdatastart;
 			goto err;
 		}
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
 
 		DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
 				(int)(data_len + bss_len + stack_len), (int)datapos);
@@ -604,9 +617,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		realdatastart = textpos + ntohl(hdr->data_start);
-		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
-		reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
-				MAX_SHARED_LIBS * sizeof(unsigned long));
+		datapos = ALIGN(realdatastart +
+				MAX_SHARED_LIBS * sizeof(unsigned long),
+				FLAT_DATA_ALIGN);
+
+		reloc = (unsigned long *)
+			(datapos + (ntohl(hdr->reloc_start) - text_len));
 		memp = textpos;
 		memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
@@ -854,7 +870,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;		/* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *);	/* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *);	/* the envp array */
-
+	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (res > (unsigned long)-4096)
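All the FLAT_DATA_ALIGN arithmetic above reduces to two idioms: round an address down with `x & -align` (stacks grow downwards) and round a size or address up with an ALIGN()-style mask; both require a power-of-two alignment. Standalone, with arbitrary sample addresses (ALIGN_UP is this sketch's spelling of the kernel's ALIGN()):

#include <stdint.h>
#include <stdio.h>

#define DATA_ALIGN	(sizeof(void *))	/* assumed power of two */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

int main(void)
{
	uintptr_t sp = 0x100f;		/* candidate stack pointer */
	uintptr_t data = 0x2001;	/* candidate data start */

	sp &= -(uintptr_t)DATA_ALIGN;		/* round DOWN: 0x1008 on LP64 */
	data = ALIGN_UP(data, DATA_ALIGN);	/* round UP:   0x2008 on LP64 */

	printf("%#lx %#lx\n", (unsigned long)sp, (unsigned long)data);
	return 0;
}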
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio.c b/fs/bio.c
index a040cde7f6fd..98711647ece4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	struct bio_vec *bvl;
 
 	/*
-	 * If 'bs' is given, lookup the pool and do the mempool alloc.
-	 * If not, this is a bio_kmalloc() allocation and just do a
-	 * kzalloc() for the exact number of vecs right away.
-	 */
-	if (!bs)
-		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
-
-	/*
 	 * see comment near bvec_array define!
 	 */
 	switch (nr) {
@@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 	mempool_free(p, bs->bio_pool);
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_has_allocated_vec(bio))
-		kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -301,21 +278,15 @@
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
-	struct bio *bio = NULL;
-	unsigned long idx = 0;
-	void *p = NULL;
-
-	if (bs) {
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p)
-			goto err;
-		bio = p + bs->front_pad;
-	} else {
-		bio = kmalloc(sizeof(*bio), gfp_mask);
-		if (!bio)
-			goto err;
-	}
+	struct bio *bio;
+	void *p;
+
+	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!p))
+		return NULL;
+	bio = p + bs->front_pad;
 
 	bio_init(bio);
 
@@ -332,22 +303,33 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		nr_iovecs = bvec_nr_vecs(idx);
 	}
+out_set:
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
-out_set:
 	bio->bi_io_vec = bvl;
-
 	return bio;
 
 err_free:
-	if (bs)
-		mempool_free(p, bs->bio_pool);
-	else
-		kfree(bio);
-err:
+	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
 
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
+}
+
+/**
+ * bio_alloc - allocate a new bio, memory pool backed
+ * @gfp_mask: allocation mask to use
+ * @nr_iovecs: number of iovecs
+ *
+ * Allocate a new bio with @nr_iovecs bvecs.  If @gfp_mask
+ * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -358,19 +340,45 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
-/*
- * Like bio_alloc(), but doesn't use a mempool backing. This means that
- * it CAN fail, but while bio_alloc() can only be used for allocations
- * that have a short (finite) life span, bio_kmalloc() should be used
- * for more permanent bio allocations (like allocating some bio's for
- * initalization or setup purposes).
- */
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+	kfree(bio);
+}
+
+/**
+ * bio_alloc - allocate a bio for I/O
+ * @gfp_mask:   the GFP_ mask given to the slab allocator
+ * @nr_iovecs:	number of iovecs to pre-allocate
+ *
+ * Description:
+ *   bio_alloc will allocate a bio and associated bio_vec array that can hold
+ *   at least @nr_iovecs entries. Allocations will be done from the
+ *   fs_bio_set. Also see @bio_alloc_bioset.
+ *
+ *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ *   a bio. This is due to the mempool guarantees. To make this work, callers
+ *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   that need to allocate more than 1 bio must always submit the previously
+ *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   do so can cause livelocks under memory pressure.
+ *
+ **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+	struct bio *bio;
 
-	if (bio)
-		bio->bi_destructor = bio_kmalloc_destructor;
+	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
+		      gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_destructor = bio_kmalloc_destructor;
 
 	return bio;
 }
@@ -809,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		len += iov[i].iov_len;
 	}
 
+	if (offset)
+		nr_pages++;
+
 	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -938,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1122,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 	int offset, i;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1420,8 +1431,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 }
 
 /*
- * split a bio - only worry about a bio with a single page
- * in it's iovec
+ * split a bio - only worry about a bio with a single page in its iovec
 */
 struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
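bio_kmalloc() above now makes a single allocation sized for the bio plus its bio_vec array and points bi_io_vec at the trailing inline storage, instead of round-tripping through the mempool path with a NULL bio_set. A userspace sketch of the same single-allocation layout using a C99 flexible array member (the struct and function names are invented for this sketch):

#include <stdio.h>
#include <stdlib.h>

struct vec { void *base; size_t len; };

struct io {
	unsigned int max_vecs;
	struct vec *vecs;		/* points at inline_vecs below */
	struct vec inline_vecs[];	/* storage allocated with the struct */
};

static struct io *io_alloc(unsigned int nr_vecs)
{
	struct io *io = malloc(sizeof(*io) + nr_vecs * sizeof(struct vec));

	if (!io)
		return NULL;
	io->max_vecs = nr_vecs;
	io->vecs = io->inline_vecs;	/* one allocation, one free */
	return io;
}

int main(void)
{
	struct io *io = io_alloc(4);

	if (!io)
		return 1;
	printf("allocated %u inline vecs\n", io->max_vecs);
	free(io);			/* frees header and vec array together */
	return 0;
}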
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf3..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 	}
 	return sync_blockdev(bdev);
 }
+EXPORT_SYMBOL(fsync_bdev);
 
 /**
 * freeze_bdev  -- lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..94212844a9bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
 
 obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
+	   compression.o delayed-ref.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..cbba000dccbe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 return ERR_PTR(-EINVAL); 60 return ERR_PTR(-EINVAL);
61 } 61 }
62 62
63 /* Handle the cached NULL acl case without locking */
64 acl = ACCESS_ONCE(*p_acl);
65 if (!acl)
66 return acl;
67
63 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED) 69 acl = *p_acl;
65 acl = posix_acl_dup(*p_acl); 70 if (acl != BTRFS_ACL_NOT_CACHED)
71 acl = posix_acl_dup(acl);
66 spin_unlock(&inode->i_lock); 72 spin_unlock(&inode->i_lock);
67 73
68 if (acl) 74 if (acl != BTRFS_ACL_NOT_CACHED)
69 return acl; 75 return acl;
70 76
71
72 size = __btrfs_getxattr(inode, name, "", 0); 77 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) { 78 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS); 79 value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
80 btrfs_update_cached_acl(inode, p_acl, acl); 85 btrfs_update_cached_acl(inode, p_acl, acl);
81 } 86 }
82 kfree(value); 87 kfree(value);
83 } else if (size == -ENOENT) { 88 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
89 /* FIXME, who returns -ENOENT? I think nobody */
84 acl = NULL; 90 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl); 91 btrfs_update_cached_acl(inode, p_acl, acl);
92 } else {
93 acl = ERR_PTR(-EIO);
86 } 94 }
87 95
88 return acl; 96 return acl;
@@ -256,7 +264,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 264 }
257 265
258 if (!acl) 266 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 267 inode->i_mode &= ~current_umask();
260 } 268 }
261 269
262 if (IS_POSIXACL(dir) && acl) { 270 if (IS_POSIXACL(dir) && acl) {
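(The new fast path in this hunk reads the cached pointer once, without the spinlock, and returns immediately when a NULL ACL has already been cached; only the BTRFS_ACL_NOT_CACHED sentinel, or a real ACL that needs a reference taken, forces the locked path. A minimal userspace sketch of that check-sentinel-then-lock pattern, with hypothetical names:)

/* Sketch of the cached-NULL fast path: a sentinel means "not yet
 * cached", while NULL is a valid cached answer.  Names hypothetical. */
#include <pthread.h>
#include <stdio.h>

#define NOT_CACHED ((void *)-1)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_acl = NOT_CACHED;

static void *get_acl(void)
{
    /* lock-free fast path: a cached NULL needs no refcounting */
    void *acl = *(void * volatile *)&cached_acl;   /* ~ACCESS_ONCE */
    if (!acl)
        return NULL;

    pthread_mutex_lock(&lock);
    acl = cached_acl;
    /* a real implementation would take a reference here, as the
     * kernel code does with posix_acl_dup() */
    pthread_mutex_unlock(&lock);

    if (acl != NOT_CACHED)
        return acl;
    /* slow path: fetch from backing store, then cache the result */
    return NULL;
}

int main(void)
{
    printf("%p\n", get_acl());
    return 0;
}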
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,12 +20,12 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1 26#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2 27#define WORK_ORDER_DONE_BIT 2
28#define WORK_HIGH_PRIO_BIT 3
29 29
30/* 30/*
31 * container for the kthread task pointer and the list of pending work 31 * container for the kthread task pointer and the list of pending work
@@ -37,6 +37,7 @@ struct btrfs_worker_thread {
37 37
38 /* list of struct btrfs_work that are waiting for service */ 38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending; 39 struct list_head pending;
40 struct list_head prio_pending;
40 41
41 /* list of worker threads from struct btrfs_workers */ 42 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list; 43 struct list_head worker_list;
@@ -104,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
104 105
105 spin_lock_irqsave(&workers->lock, flags); 106 spin_lock_irqsave(&workers->lock, flags);
106 107
107 while (!list_empty(&workers->order_list)) { 108 while (1) {
108 work = list_entry(workers->order_list.next, 109 if (!list_empty(&workers->prio_order_list)) {
109 struct btrfs_work, order_list); 110 work = list_entry(workers->prio_order_list.next,
110 111 struct btrfs_work, order_list);
112 } else if (!list_empty(&workers->order_list)) {
113 work = list_entry(workers->order_list.next,
114 struct btrfs_work, order_list);
115 } else {
116 break;
117 }
111 if (!test_bit(WORK_DONE_BIT, &work->flags)) 118 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break; 119 break;
113 120
@@ -144,8 +151,14 @@ static int worker_loop(void *arg)
144 do { 151 do {
145 spin_lock_irq(&worker->lock); 152 spin_lock_irq(&worker->lock);
146again_locked: 153again_locked:
147 while (!list_empty(&worker->pending)) { 154 while (1) {
148 cur = worker->pending.next; 155 if (!list_empty(&worker->prio_pending))
156 cur = worker->prio_pending.next;
157 else if (!list_empty(&worker->pending))
158 cur = worker->pending.next;
159 else
160 break;
161
149 work = list_entry(cur, struct btrfs_work, list); 162 work = list_entry(cur, struct btrfs_work, list);
150 list_del(&work->list); 163 list_del(&work->list);
151 clear_bit(WORK_QUEUED_BIT, &work->flags); 164 clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -164,7 +177,6 @@ again_locked:
164 177
165 spin_lock_irq(&worker->lock); 178 spin_lock_irq(&worker->lock);
166 check_idle_worker(worker); 179 check_idle_worker(worker);
167
168 } 180 }
169 if (freezing(current)) { 181 if (freezing(current)) {
170 worker->working = 0; 182 worker->working = 0;
@@ -179,7 +191,8 @@ again_locked:
179 * jump_in? 191 * jump_in?
180 */ 192 */
181 smp_mb(); 193 smp_mb();
182 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending) ||
195 !list_empty(&worker->prio_pending))
183 continue; 196 continue;
184 197
185 /* 198 /*
@@ -192,13 +205,18 @@ again_locked:
192 */ 205 */
193 schedule_timeout(1); 206 schedule_timeout(1);
194 smp_mb(); 207 smp_mb();
195 if (!list_empty(&worker->pending)) 208 if (!list_empty(&worker->pending) ||
209 !list_empty(&worker->prio_pending))
196 continue; 210 continue;
197 211
212 if (kthread_should_stop())
213 break;
214
198 /* still no more work?, sleep for real */ 215 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock); 216 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 217 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending)) 218 if (!list_empty(&worker->pending) ||
219 !list_empty(&worker->prio_pending))
202 goto again_locked; 220 goto again_locked;
203 221
204 /* 222 /*
@@ -208,7 +226,8 @@ again_locked:
208 worker->working = 0; 226 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 227 spin_unlock_irq(&worker->lock);
210 228
211 schedule(); 229 if (!kthread_should_stop())
230 schedule();
212 } 231 }
213 __set_current_state(TASK_RUNNING); 232 __set_current_state(TASK_RUNNING);
214 } 233 }
@@ -245,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
245 INIT_LIST_HEAD(&workers->worker_list); 264 INIT_LIST_HEAD(&workers->worker_list);
246 INIT_LIST_HEAD(&workers->idle_list); 265 INIT_LIST_HEAD(&workers->idle_list);
247 INIT_LIST_HEAD(&workers->order_list); 266 INIT_LIST_HEAD(&workers->order_list);
267 INIT_LIST_HEAD(&workers->prio_order_list);
248 spin_lock_init(&workers->lock); 268 spin_lock_init(&workers->lock);
249 workers->max_workers = max; 269 workers->max_workers = max;
250 workers->idle_thresh = 32; 270 workers->idle_thresh = 32;
@@ -270,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
270 } 290 }
271 291
272 INIT_LIST_HEAD(&worker->pending); 292 INIT_LIST_HEAD(&worker->pending);
293 INIT_LIST_HEAD(&worker->prio_pending);
273 INIT_LIST_HEAD(&worker->worker_list); 294 INIT_LIST_HEAD(&worker->worker_list);
274 spin_lock_init(&worker->lock); 295 spin_lock_init(&worker->lock);
275 atomic_set(&worker->num_pending, 0); 296 atomic_set(&worker->num_pending, 0);
@@ -393,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
393 goto out; 414 goto out;
394 415
395 spin_lock_irqsave(&worker->lock, flags); 416 spin_lock_irqsave(&worker->lock, flags);
396 list_add_tail(&work->list, &worker->pending); 417 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
418 list_add_tail(&work->list, &worker->prio_pending);
419 else
420 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending); 421 atomic_inc(&worker->num_pending);
398 422
399 /* by definition we're busy, take ourselves off the idle 423 /* by definition we're busy, take ourselves off the idle
@@ -419,6 +443,11 @@ out:
419 return 0; 443 return 0;
420} 444}
421 445
446void btrfs_set_work_high_prio(struct btrfs_work *work)
447{
448 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
449}
450
422/* 451/*
423 * places a struct btrfs_work into the pending queue of one of the kthreads 452 * places a struct btrfs_work into the pending queue of one of the kthreads
424 */ 453 */
@@ -435,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
435 worker = find_worker(workers); 464 worker = find_worker(workers);
436 if (workers->ordered) { 465 if (workers->ordered) {
437 spin_lock_irqsave(&workers->lock, flags); 466 spin_lock_irqsave(&workers->lock, flags);
438 list_add_tail(&work->order_list, &workers->order_list); 467 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
468 list_add_tail(&work->order_list,
469 &workers->prio_order_list);
470 } else {
471 list_add_tail(&work->order_list, &workers->order_list);
472 }
439 spin_unlock_irqrestore(&workers->lock, flags); 473 spin_unlock_irqrestore(&workers->lock, flags);
440 } else { 474 } else {
441 INIT_LIST_HEAD(&work->order_list); 475 INIT_LIST_HEAD(&work->order_list);
@@ -443,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
443 477
444 spin_lock_irqsave(&worker->lock, flags); 478 spin_lock_irqsave(&worker->lock, flags);
445 479
446 list_add_tail(&work->list, &worker->pending); 480 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
481 list_add_tail(&work->list, &worker->prio_pending);
482 else
483 list_add_tail(&work->list, &worker->pending);
447 atomic_inc(&worker->num_pending); 484 atomic_inc(&worker->num_pending);
448 check_busy_worker(worker); 485 check_busy_worker(worker);
449 486
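(The pattern repeated throughout this file is a strict two-level queue: every pop site checks prio_pending before pending, and every push site routes on WORK_HIGH_PRIO_BIT, so high-priority work is never queued behind the regular list. A condensed userspace sketch of that push/pop discipline; types are hypothetical and no locking is shown:)

/* Two-level work queue sketch: always drain the prio list first. */
#include <stdio.h>

struct work {
    int high_prio;                /* ~WORK_HIGH_PRIO_BIT */
    struct work *next;
};

struct worker {
    struct work *pending, *prio_pending;
};

static void queue_work(struct worker *w, struct work *wk)
{
    struct work **head = wk->high_prio ? &w->prio_pending : &w->pending;
    wk->next = *head;             /* LIFO for brevity; kernel lists are FIFO */
    *head = wk;
}

static struct work *pop_work(struct worker *w)
{
    struct work **head;
    struct work *wk;

    if (w->prio_pending)          /* the prio list always wins */
        head = &w->prio_pending;
    else if (w->pending)
        head = &w->pending;
    else
        return NULL;              /* ~the break out of worker_loop */
    wk = *head;
    *head = wk->next;
    return wk;
}

int main(void)
{
    struct worker w = { NULL, NULL };
    struct work a = { 0 }, b = { 1 };
    queue_work(&w, &a);
    queue_work(&w, &b);
    printf("first pop high_prio=%d\n", pop_work(&w)->high_prio);
    return 0;
}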
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
85 * of work items waiting for completion 85 * of work items waiting for completion
86 */ 86 */
87 struct list_head order_list; 87 struct list_head order_list;
88 struct list_head prio_order_list;
88 89
89 /* lock for finding the next worker thread to queue on */ 90 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock; 91 spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers); 99int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 100void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work); 101int btrfs_requeue_work(struct btrfs_work *work);
102void btrfs_set_work_high_prio(struct btrfs_work *work);
101#endif 103#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, its silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
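(Both additions in this hunk serve the same commit-ordering problem: ordered_operations tracks inodes whose data must reach disk before the transaction commits, and the ordered_data_close bitflag marks a truncated-to-zero file so a later release puts it on that list. A small sketch of the bookkeeping, with hypothetical names:)

/* Sketch: a commit-time flush list for inodes, driven by a single
 * bitflag set at truncate time. */
#include <stdio.h>

struct inode_x {
    int id;
    unsigned ordered_data_close:1;
    struct inode_x *next_ordered;   /* ~list_head ordered_operations */
};

static struct inode_x *ordered_ops;  /* per-transaction list */

static void file_release(struct inode_x *inode)
{
    if (inode->ordered_data_close) {
        inode->ordered_data_close = 0;
        inode->next_ordered = ordered_ops;   /* queue for commit */
        ordered_ops = inode;
    }
}

static void transaction_commit(void)
{
    struct inode_x *i;

    /* flush data for every queued inode before writing the commit */
    for (i = ordered_ops; i; i = i->next_ordered)
        printf("flushing inode %d before commit\n", i->id);
    ordered_ops = NULL;
}

int main(void)
{
    struct inode_x ino = { 42, 1, NULL };
    file_release(&ino);
    transaction_commit();
    return 0;
}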
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..fedf8b9f03a2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
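(With prealloc_dest gone, every cow goes through btrfs_alloc_free_block, and the helper's job reduces to: allocate a fresh block near search_start, copy the contents, rewire the parent. A toy sketch of copy-on-write for an in-memory node, with hypothetical types and no reference counting or locking:)

/* Copy-on-write sketch: never modify a shared node in place;
 * allocate a copy and swap the parent's pointer to it. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    unsigned long long generation;
    char payload[32];
};

/* ~__btrfs_cow_block: returns the writable copy via *cow_ret */
static int cow_block(struct node *buf, struct node **parent_slot,
                     struct node **cow_ret, unsigned long long transid)
{
    struct node *cow;

    if (buf->generation == transid) {   /* already ours this transaction */
        *cow_ret = buf;
        return 0;
    }
    cow = malloc(sizeof(*cow));
    if (!cow)
        return -1;
    memcpy(cow, buf, sizeof(*cow));
    cow->generation = transid;
    *parent_slot = cow;                 /* point the parent at the copy */
    *cow_ret = cow;
    return 0;
}

int main(void)
{
    struct node old = { 5, "shared with a snapshot" };
    struct node *slot = &old, *writable;

    cow_block(&old, &slot, &writable, 9);
    printf("old gen %llu, new gen %llu\n",
           old.generation, writable->generation);
    free(writable);
    return 0;
}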
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1262,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1262 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1263 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1264 */ 1246 */
1265static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1266 struct btrfs_path *path, 1248 struct btrfs_path *path,
1267 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1268{ 1250{
1269 struct extent_buffer *node; 1251 struct extent_buffer *node;
1270 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1343,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1343 int ret = 0; 1325 int ret = 0;
1344 int blocksize; 1326 int blocksize;
1345 1327
1346 parent = path->nodes[level - 1]; 1328 parent = path->nodes[level + 1];
1347 if (!parent) 1329 if (!parent)
1348 return 0; 1330 return 0;
1349 1331
1350 nritems = btrfs_header_nritems(parent); 1332 nritems = btrfs_header_nritems(parent);
1351 slot = path->slots[level]; 1333 slot = path->slots[level + 1];
1352 blocksize = btrfs_level_size(root, level); 1334 blocksize = btrfs_level_size(root, level);
1353 1335
1354 if (slot > 0) { 1336 if (slot > 0) {
@@ -1359,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1359 block1 = 0; 1341 block1 = 0;
1360 free_extent_buffer(eb); 1342 free_extent_buffer(eb);
1361 } 1343 }
1362 if (slot < nritems) { 1344 if (slot + 1 < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1); 1345 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1); 1346 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize); 1347 eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1369,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1369 } 1351 }
1370 if (block1 || block2) { 1352 if (block1 || block2) {
1371 ret = -EAGAIN; 1353 ret = -EAGAIN;
1354
1355 /* release the whole path */
1372 btrfs_release_path(root, path); 1356 btrfs_release_path(root, path);
1357
1358 /* read the blocks */
1373 if (block1) 1359 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0); 1360 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2) 1361 if (block2)
@@ -1379,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
1379 eb = read_tree_block(root, block1, blocksize, 0); 1365 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb); 1366 free_extent_buffer(eb);
1381 } 1367 }
1382 if (block1) { 1368 if (block2) {
1383 eb = read_tree_block(root, block2, blocksize, 0); 1369 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb); 1370 free_extent_buffer(eb);
1385 } 1371 }
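(The fix above is subtle: the siblings of path->nodes[level] live in the parent at level + 1, not level - 1, the right-hand neighbour exists only when slot + 1 < nritems, and the old code read block1 twice instead of block1 then block2. The neighbour selection in isolation, over a hypothetical array-based parent:)

/* Sketch of picking the two neighbours to read ahead around
 * parent slot 'slot'. */
#include <stdio.h>

static void pick_neighbours(const unsigned long long *blockptr,
                            unsigned int nritems, unsigned int slot,
                            unsigned long long *block1,
                            unsigned long long *block2)
{
    *block1 = *block2 = 0;
    if (slot > 0)                 /* left sibling exists */
        *block1 = blockptr[slot - 1];
    if (slot + 1 < nritems)       /* right sibling exists */
        *block2 = blockptr[slot + 1];
}

int main(void)
{
    unsigned long long ptrs[] = { 100, 200, 300 };
    unsigned long long b1, b2;

    pick_neighbours(ptrs, 3, 2, &b1, &b2);   /* last slot: no right */
    printf("left=%llu right=%llu\n", b1, b2);
    return 0;
}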
@@ -1465,6 +1451,138 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1465} 1451}
1466 1452
1467/* 1453/*
1454 * helper function for btrfs_search_slot. The goal is to find a block
1455 * in cache without setting the path to blocking. If we find the block
1456 * we return zero and the path is unchanged.
1457 *
1458 * If we can't find the block, we set the path blocking and do some
1459 * reada. -EAGAIN is returned and the search must be repeated.
1460 */
1461static int
1462read_block_for_search(struct btrfs_trans_handle *trans,
1463 struct btrfs_root *root, struct btrfs_path *p,
1464 struct extent_buffer **eb_ret, int level, int slot,
1465 struct btrfs_key *key)
1466{
1467 u64 blocknr;
1468 u64 gen;
1469 u32 blocksize;
1470 struct extent_buffer *b = *eb_ret;
1471 struct extent_buffer *tmp;
1472 int ret;
1473
1474 blocknr = btrfs_node_blockptr(b, slot);
1475 gen = btrfs_node_ptr_generation(b, slot);
1476 blocksize = btrfs_level_size(root, level - 1);
1477
1478 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1479 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1480 /*
1481 * we found an up to date block without sleeping, return
1482 * right away
1483 */
1484 *eb_ret = tmp;
1485 return 0;
1486 }
1487
1488 /*
1489 * reduce lock contention at high levels
1490 * of the btree by dropping locks before
1491 * we read. Don't release the lock on the current
1492 * level because we need to walk this node to figure
1493 * out which blocks to read.
1494 */
1495 btrfs_unlock_up_safe(p, level + 1);
1496 btrfs_set_path_blocking(p);
1497
1498 if (tmp)
1499 free_extent_buffer(tmp);
1500 if (p->reada)
1501 reada_for_search(root, p, level, slot, key->objectid);
1502
1503 btrfs_release_path(NULL, p);
1504
1505 ret = -EAGAIN;
1506 tmp = read_tree_block(root, blocknr, blocksize, gen);
1507 if (tmp) {
1508 /*
1509 * If the read above didn't mark this buffer up to date,
1510 * it will never end up being up to date. Set ret to EIO now
1511 * and give up so that our caller doesn't loop forever
1512 * on our EAGAINs.
1513 */
1514 if (!btrfs_buffer_uptodate(tmp, 0))
1515 ret = -EIO;
1516 free_extent_buffer(tmp);
1517 }
1518 return ret;
1519}
1520
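(The contract of the helper above is worth spelling out: a cache hit returns 0 with the path untouched; a miss drops the locks, issues the read, and returns -EAGAIN so the caller restarts the whole search, or -EIO if the read can never succeed. A minimal sketch of that try-fast-path-else-prepare-and-retry shape, with a hypothetical stand-in for the block cache:)

/* Sketch of the -EAGAIN retry contract used by the search helpers;
 * errno-style negative returns as in the kernel code. */
#include <stdio.h>
#include <errno.h>

static int cache_warm;   /* stands in for btrfs_find_tree_block() */

static int read_block(int *out)
{
    if (cache_warm) {        /* found up to date without sleeping */
        *out = 42;
        return 0;
    }
    /* slow path: drop locks, read from disk, then tell the caller
     * to start over from the root */
    cache_warm = 1;
    return -EAGAIN;
}

static int search(void)
{
    int val, ret;
again:
    ret = read_block(&val);
    if (ret == -EAGAIN)
        goto again;          /* the path was released; restart */
    if (ret)
        return ret;          /* -EIO: give up, don't loop forever */
    return val;
}

int main(void)
{
    printf("search() = %d\n", search());
    return 0;
}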
1521/*
1522 * helper function for btrfs_search_slot. This does all of the checks
1523 * for node-level blocks and does any balancing required based on
1524 * the ins_len.
1525 *
1526 * If no extra work was required, zero is returned. If we had to
1527 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1528 * start over
1529 */
1530static int
1531setup_nodes_for_search(struct btrfs_trans_handle *trans,
1532 struct btrfs_root *root, struct btrfs_path *p,
1533 struct extent_buffer *b, int level, int ins_len)
1534{
1535 int ret;
1536 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1537 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1538 int sret;
1539
1540 sret = reada_for_balance(root, p, level);
1541 if (sret)
1542 goto again;
1543
1544 btrfs_set_path_blocking(p);
1545 sret = split_node(trans, root, p, level);
1546 btrfs_clear_path_blocking(p, NULL);
1547
1548 BUG_ON(sret > 0);
1549 if (sret) {
1550 ret = sret;
1551 goto done;
1552 }
1553 b = p->nodes[level];
1554 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1555 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1556 int sret;
1557
1558 sret = reada_for_balance(root, p, level);
1559 if (sret)
1560 goto again;
1561
1562 btrfs_set_path_blocking(p);
1563 sret = balance_level(trans, root, p, level);
1564 btrfs_clear_path_blocking(p, NULL);
1565
1566 if (sret) {
1567 ret = sret;
1568 goto done;
1569 }
1570 b = p->nodes[level];
1571 if (!b) {
1572 btrfs_release_path(NULL, p);
1573 goto again;
1574 }
1575 BUG_ON(btrfs_header_nritems(b) == 1);
1576 }
1577 return 0;
1578
1579again:
1580 ret = -EAGAIN;
1581done:
1582 return ret;
1583}
1584
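(setup_nodes_for_search concentrates the two balancing triggers in one place: an insert, ins_len > 0, splits a node that is within three pointers of full, and a delete, ins_len < 0, rebalances a node that has dropped below a quarter full. The thresholds in isolation; NODEPTRS is an illustrative stand-in for BTRFS_NODEPTRS_PER_BLOCK(root):)

/* Sketch of the node-balancing thresholds. */
#include <stdio.h>

#define NODEPTRS 121   /* illustrative value */

enum action { NOTHING, SPLIT, BALANCE };

static enum action node_action(unsigned int nritems, int ins_len)
{
    if (ins_len > 0 && nritems >= NODEPTRS - 3)
        return SPLIT;            /* nearly full: split before insert */
    if (ins_len < 0 && nritems < NODEPTRS / 4)
        return BALANCE;          /* nearly empty: merge/steal first */
    return NOTHING;
}

int main(void)
{
    printf("%d %d %d\n",
           node_action(119, 1),     /* SPLIT */
           node_action(20, -1),     /* BALANCE */
           node_action(60, 1));     /* NOTHING */
    return 0;
}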
1585/*
1468 * look for key in the tree. path is filled in with nodes along the way 1586 * look for key in the tree. path is filled in with nodes along the way
1469 * if key is found, we return zero and you can find the item in the leaf 1587 * if key is found, we return zero and you can find the item in the leaf
1470 * level of the path (level 0) 1588 * level of the path (level 0)
@@ -1482,17 +1600,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1482 ins_len, int cow) 1600 ins_len, int cow)
1483{ 1601{
1484 struct extent_buffer *b; 1602 struct extent_buffer *b;
1485 struct extent_buffer *tmp;
1486 int slot; 1603 int slot;
1487 int ret; 1604 int ret;
1488 int level; 1605 int level;
1489 int should_reada = p->reada;
1490 int lowest_unlock = 1; 1606 int lowest_unlock = 1;
1491 int blocksize;
1492 u8 lowest_level = 0; 1607 u8 lowest_level = 0;
1493 u64 blocknr;
1494 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1608
1497 lowest_level = p->lowest_level; 1609 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1610 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1613,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1613 if (ins_len < 0)
1502 lowest_unlock = 2; 1614 lowest_unlock = 2;
1503 1615
1504 prealloc_block.objectid = 0;
1505
1506again: 1616again:
1507 if (p->skip_locking) 1617 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1618 b = btrfs_root_node(root);
@@ -1523,50 +1633,21 @@ again:
1523 if (cow) { 1633 if (cow) {
1524 int wret; 1634 int wret;
1525 1635
1526 /* is a cow on this block not required */ 1636 /*
1637 * if we don't really need to cow this block
1638 * then we don't want to set the path blocking,
1639 * so we test it here
1640 */
1527 if (btrfs_header_generation(b) == trans->transid && 1641 if (btrfs_header_generation(b) == trans->transid &&
1528 btrfs_header_owner(b) == root->root_key.objectid && 1642 btrfs_header_owner(b) == root->root_key.objectid &&
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1643 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1644 goto cow_done;
1531 } 1645 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1646 btrfs_set_path_blocking(p);
1564 1647
1565 wret = btrfs_cow_block(trans, root, b, 1648 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1649 p->nodes[level + 1],
1567 p->slots[level + 1], 1650 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1651 if (wret) {
1571 free_extent_buffer(b); 1652 free_extent_buffer(b);
1572 ret = wret; 1653 ret = wret;
@@ -1611,51 +1692,15 @@ cow_done:
1611 if (ret && slot > 0) 1692 if (ret && slot > 0)
1612 slot -= 1; 1693 slot -= 1;
1613 p->slots[level] = slot; 1694 p->slots[level] = slot;
1614 if ((p->search_for_split || ins_len > 0) && 1695 ret = setup_nodes_for_search(trans, root, p, b, level,
1615 btrfs_header_nritems(b) >= 1696 ins_len);
1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1697 if (ret == -EAGAIN)
1617 int sret; 1698 goto again;
1618 1699 else if (ret)
1619 sret = reada_for_balance(root, p, level); 1700 goto done;
1620 if (sret) 1701 b = p->nodes[level];
1621 goto again; 1702 slot = p->slots[level];
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1627 BUG_ON(sret > 0);
1628 if (sret) {
1629 ret = sret;
1630 goto done;
1631 }
1632 b = p->nodes[level];
1633 slot = p->slots[level];
1634 } else if (ins_len < 0 &&
1635 btrfs_header_nritems(b) <
1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646 1703
1647 if (sret) {
1648 ret = sret;
1649 goto done;
1650 }
1651 b = p->nodes[level];
1652 if (!b) {
1653 btrfs_release_path(NULL, p);
1654 goto again;
1655 }
1656 slot = p->slots[level];
1657 BUG_ON(btrfs_header_nritems(b) == 1);
1658 }
1659 unlock_up(p, level, lowest_unlock); 1704 unlock_up(p, level, lowest_unlock);
1660 1705
1661 /* this is only true while dropping a snapshot */ 1706 /* this is only true while dropping a snapshot */
@@ -1664,44 +1709,14 @@ cow_done:
1664 goto done; 1709 goto done;
1665 } 1710 }
1666 1711
1667 blocknr = btrfs_node_blockptr(b, slot); 1712 ret = read_block_for_search(trans, root, p,
1668 gen = btrfs_node_ptr_generation(b, slot); 1713 &b, level, slot, key);
1669 blocksize = btrfs_level_size(root, level - 1); 1714 if (ret == -EAGAIN)
1715 goto again;
1716
1717 if (ret == -EIO)
1718 goto done;
1670 1719
1671 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1672 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673 b = tmp;
1674 } else {
1675 /*
1676 * reduce lock contention at high levels
1677 * of the btree by dropping locks before
1678 * we read.
1679 */
1680 if (level > 0) {
1681 btrfs_release_path(NULL, p);
1682 if (tmp)
1683 free_extent_buffer(tmp);
1684 if (should_reada)
1685 reada_for_search(root, p,
1686 level, slot,
1687 key->objectid);
1688
1689 tmp = read_tree_block(root, blocknr,
1690 blocksize, gen);
1691 if (tmp)
1692 free_extent_buffer(tmp);
1693 goto again;
1694 } else {
1695 btrfs_set_path_blocking(p);
1696 if (tmp)
1697 free_extent_buffer(tmp);
1698 if (should_reada)
1699 reada_for_search(root, p,
1700 level, slot,
1701 key->objectid);
1702 b = read_node_slot(root, b, slot);
1703 }
1704 }
1705 if (!p->skip_locking) { 1720 if (!p->skip_locking) {
1706 int lret; 1721 int lret;
1707 1722
@@ -1742,12 +1757,10 @@ done:
1742 * we don't really know what they plan on doing with the path 1757 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1758 * from here on, so for now just mark it as blocking
1744 */ 1759 */
1745 btrfs_set_path_blocking(p); 1760 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1761 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root, 1762 if (ret < 0)
1748 prealloc_block.objectid, 1763 btrfs_release_path(root, p);
1749 prealloc_block.offset);
1750 }
1751 return ret; 1764 return ret;
1752} 1765}
1753 1766
@@ -1768,7 +1781,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1781 int ret;
1769 1782
1770 eb = btrfs_lock_root_node(root); 1783 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1784 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1785 BUG_ON(ret);
1773 1786
1774 btrfs_set_lock_blocking(eb); 1787 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1839,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1839 }
1827 1840
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1841 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1842 &eb);
1830 BUG_ON(ret); 1843 BUG_ON(ret);
1831 1844
1832 if (root->root_key.objectid == 1845 if (root->root_key.objectid ==
@@ -2139,7 +2152,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2152 spin_unlock(&root->node_lock);
2140 2153
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2154 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2155 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2156 root->root_key.objectid,
2144 trans->transid, level - 1); 2157 trans->transid, level - 1);
2145 BUG_ON(ret); 2158 BUG_ON(ret);
@@ -2174,8 +2187,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2174 BUG_ON(!path->nodes[level]); 2187 BUG_ON(!path->nodes[level]);
2175 lower = path->nodes[level]; 2188 lower = path->nodes[level];
2176 nritems = btrfs_header_nritems(lower); 2189 nritems = btrfs_header_nritems(lower);
2177 if (slot > nritems) 2190 BUG_ON(slot > nritems);
2178 BUG();
2179 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2191 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2180 BUG(); 2192 BUG();
2181 if (slot != nritems) { 2193 if (slot != nritems) {
@@ -2221,7 +2233,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2233 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2234 if (ret)
2223 return ret; 2235 return ret;
2224 } else { 2236 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2237 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2238 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2239 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2341,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2341 return ret;
2330} 2342}
2331 2343
2332/* 2344static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2345 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2346 struct btrfs_path *path,
2335 * 2347 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2348 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2349 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2350{
2343 struct extent_buffer *left = path->nodes[0]; 2351 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2352 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2353 struct btrfs_disk_key disk_key;
2347 int slot; 2354 int slot;
2348 u32 i; 2355 u32 i;
2349 int free_space;
2350 int push_space = 0; 2356 int push_space = 0;
2351 int push_items = 0; 2357 int push_items = 0;
2352 struct btrfs_item *item; 2358 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2359 u32 nr;
2355 u32 right_nritems; 2360 u32 right_nritems;
2356 u32 data_end; 2361 u32 data_end;
2357 u32 this_item_size; 2362 u32 this_item_size;
2358 int ret; 2363 int ret;
2359 2364
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 btrfs_assert_tree_locked(path->nodes[1]);
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2365 if (empty)
2393 nr = 0; 2366 nr = 0;
2394 else 2367 else
@@ -2397,6 +2370,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2370 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2371 push_space += data_size;
2399 2372
2373 slot = path->slots[1];
2400 i = left_nritems - 1; 2374 i = left_nritems - 1;
2401 while (i >= nr) { 2375 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2376 item = btrfs_item_nr(left, i);
@@ -2528,24 +2502,82 @@ out_unlock:
2528} 2502}
2529 2503
2530/* 2504/*
2505 * push some data in the path leaf to the right, trying to free up at
2506 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2507 *
2508 * returns 1 if the push failed because the other node didn't have enough
2509 * room, 0 if everything worked out and < 0 if there were major errors.
2510 */
2511static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2512 *root, struct btrfs_path *path, int data_size,
2513 int empty)
2514{
2515 struct extent_buffer *left = path->nodes[0];
2516 struct extent_buffer *right;
2517 struct extent_buffer *upper;
2518 int slot;
2519 int free_space;
2520 u32 left_nritems;
2521 int ret;
2522
2523 if (!path->nodes[1])
2524 return 1;
2525
2526 slot = path->slots[1];
2527 upper = path->nodes[1];
2528 if (slot >= btrfs_header_nritems(upper) - 1)
2529 return 1;
2530
2531 btrfs_assert_tree_locked(path->nodes[1]);
2532
2533 right = read_node_slot(root, upper, slot + 1);
2534 btrfs_tree_lock(right);
2535 btrfs_set_lock_blocking(right);
2536
2537 free_space = btrfs_leaf_free_space(root, right);
2538 if (free_space < data_size)
2539 goto out_unlock;
2540
2541 /* cow and double check */
2542 ret = btrfs_cow_block(trans, root, right, upper,
2543 slot + 1, &right);
2544 if (ret)
2545 goto out_unlock;
2546
2547 free_space = btrfs_leaf_free_space(root, right);
2548 if (free_space < data_size)
2549 goto out_unlock;
2550
2551 left_nritems = btrfs_header_nritems(left);
2552 if (left_nritems == 0)
2553 goto out_unlock;
2554
2555 return __push_leaf_right(trans, root, path, data_size, empty,
2556 right, free_space, left_nritems);
2557out_unlock:
2558 btrfs_tree_unlock(right);
2559 free_extent_buffer(right);
2560 return 1;
2561}
2562
2563/*
2531 * push some data in the path leaf to the left, trying to free up at 2564 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2565 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2566 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2567static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2568 struct btrfs_root *root,
2536 int empty) 2569 struct btrfs_path *path, int data_size,
2570 int empty, struct extent_buffer *left,
2571 int free_space, int right_nritems)
2537{ 2572{
2538 struct btrfs_disk_key disk_key; 2573 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2574 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2575 int slot;
2542 int i; 2576 int i;
2543 int free_space;
2544 int push_space = 0; 2577 int push_space = 0;
2545 int push_items = 0; 2578 int push_items = 0;
2546 struct btrfs_item *item; 2579 struct btrfs_item *item;
2547 u32 old_left_nritems; 2580 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2581 u32 nr;
2550 int ret = 0; 2582 int ret = 0;
2551 int wret; 2583 int wret;
@@ -2553,41 +2585,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2585 u32 old_left_item_size;
2554 2586
2555 slot = path->slots[1]; 2587 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 btrfs_assert_tree_locked(path->nodes[1]);
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2588
2592 if (empty) 2589 if (empty)
2593 nr = right_nritems; 2590 nr = right_nritems;
@@ -2755,6 +2752,154 @@ out:
2755} 2752}
2756 2753
2757/* 2754/*
2755 * push some data in the path leaf to the left, trying to free up at
2756 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2757 */
2758static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2759 *root, struct btrfs_path *path, int data_size,
2760 int empty)
2761{
2762 struct extent_buffer *right = path->nodes[0];
2763 struct extent_buffer *left;
2764 int slot;
2765 int free_space;
2766 u32 right_nritems;
2767 int ret = 0;
2768
2769 slot = path->slots[1];
2770 if (slot == 0)
2771 return 1;
2772 if (!path->nodes[1])
2773 return 1;
2774
2775 right_nritems = btrfs_header_nritems(right);
2776 if (right_nritems == 0)
2777 return 1;
2778
2779 btrfs_assert_tree_locked(path->nodes[1]);
2780
2781 left = read_node_slot(root, path->nodes[1], slot - 1);
2782 btrfs_tree_lock(left);
2783 btrfs_set_lock_blocking(left);
2784
2785 free_space = btrfs_leaf_free_space(root, left);
2786 if (free_space < data_size) {
2787 ret = 1;
2788 goto out;
2789 }
2790
2791 /* cow and double check */
2792 ret = btrfs_cow_block(trans, root, left,
2793 path->nodes[1], slot - 1, &left);
2794 if (ret) {
2795 /* we hit -ENOSPC, but it isn't fatal here */
2796 ret = 1;
2797 goto out;
2798 }
2799
2800 free_space = btrfs_leaf_free_space(root, left);
2801 if (free_space < data_size) {
2802 ret = 1;
2803 goto out;
2804 }
2805
2806 return __push_leaf_left(trans, root, path, data_size,
2807 empty, left, free_space, right_nritems);
2808out:
2809 btrfs_tree_unlock(left);
2810 free_extent_buffer(left);
2811 return ret;
2812}
2813
2814/*
2815 * split the path's leaf in two, making sure there is at least data_size
2816 * available for the resulting leaf level of the path.
2817 *
2818 * returns 0 if all went well and < 0 on failure.
2819 */
2820static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2821 struct btrfs_root *root,
2822 struct btrfs_path *path,
2823 struct extent_buffer *l,
2824 struct extent_buffer *right,
2825 int slot, int mid, int nritems)
2826{
2827 int data_copy_size;
2828 int rt_data_off;
2829 int i;
2830 int ret = 0;
2831 int wret;
2832 struct btrfs_disk_key disk_key;
2833
2834 nritems = nritems - mid;
2835 btrfs_set_header_nritems(right, nritems);
2836 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2837
2838 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2839 btrfs_item_nr_offset(mid),
2840 nritems * sizeof(struct btrfs_item));
2841
2842 copy_extent_buffer(right, l,
2843 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2844 data_copy_size, btrfs_leaf_data(l) +
2845 leaf_data_end(root, l), data_copy_size);
2846
2847 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2848 btrfs_item_end_nr(l, mid);
2849
2850 for (i = 0; i < nritems; i++) {
2851 struct btrfs_item *item = btrfs_item_nr(right, i);
2852 u32 ioff;
2853
2854 if (!right->map_token) {
2855 map_extent_buffer(right, (unsigned long)item,
2856 sizeof(struct btrfs_item),
2857 &right->map_token, &right->kaddr,
2858 &right->map_start, &right->map_len,
2859 KM_USER1);
2860 }
2861
2862 ioff = btrfs_item_offset(right, item);
2863 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2864 }
2865
2866 if (right->map_token) {
2867 unmap_extent_buffer(right, right->map_token, KM_USER1);
2868 right->map_token = NULL;
2869 }
2870
2871 btrfs_set_header_nritems(l, mid);
2872 ret = 0;
2873 btrfs_item_key(right, &disk_key, 0);
2874 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2875 path->slots[1] + 1, 1);
2876 if (wret)
2877 ret = wret;
2878
2879 btrfs_mark_buffer_dirty(right);
2880 btrfs_mark_buffer_dirty(l);
2881 BUG_ON(path->slots[0] != slot);
2882
2883 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 BUG_ON(ret);
2885
2886 if (mid <= slot) {
2887 btrfs_tree_unlock(path->nodes[0]);
2888 free_extent_buffer(path->nodes[0]);
2889 path->nodes[0] = right;
2890 path->slots[0] -= mid;
2891 path->slots[1] += 1;
2892 } else {
2893 btrfs_tree_unlock(right);
2894 free_extent_buffer(right);
2895 }
2896
2897 BUG_ON(path->slots[0] < 0);
2898
2899 return ret;
2900}
2901
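(copy_for_split is the mechanical half of a leaf split: move items [mid, nritems) into the new right leaf, then rebase every copied item's data offset by rt_data_off, since leaf data is packed backward from the end of the block. The offset arithmetic on a simplified leaf:)

/* Sketch of rebasing item offsets when the tail of a leaf is copied
 * into a fresh one.  Data is packed from the end of the leaf, so the
 * copied items shift up by the space their region left behind. */
#include <stdio.h>

#define LEAF_DATA_SIZE 4096

struct item { unsigned int offset, size; };

static void split_offsets(struct item *right, const struct item *left,
                          unsigned int mid, unsigned int nritems)
{
    /* high end of the first moved item's data, ~btrfs_item_end_nr(l, mid) */
    unsigned int moved_end = left[mid].offset + left[mid].size;
    unsigned int rt_data_off = LEAF_DATA_SIZE - moved_end;
    unsigned int i;

    for (i = 0; i < nritems - mid; i++) {
        right[i] = left[mid + i];
        right[i].offset += rt_data_off;   /* repack against the end */
    }
}

int main(void)
{
    struct item l[3] = { {4000, 96}, {3900, 100}, {3800, 100} };
    struct item r[2];

    split_offsets(r, l, 1, 3);
    printf("right[0] at %u, right[1] at %u\n", r[0].offset, r[1].offset);
    return 0;
}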
2902/*
2758 * split the path's leaf in two, making sure there is at least data_size 2903 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2904 * available for the resulting leaf level of the path.
2760 * 2905 *
@@ -2771,17 +2916,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2916 int mid;
2772 int slot; 2917 int slot;
2773 struct extent_buffer *right; 2918 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2919 int ret = 0;
2778 int wret; 2920 int wret;
2779 int double_split; 2921 int double_split;
2780 int num_doubles = 0; 2922 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2923
2783 /* first try to make some room by pushing left and right */ 2924 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2925 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2926 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2927 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2928 if (wret < 0)
2787 return wret; 2929 return wret;
@@ -2830,11 +2972,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2972 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2973 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2974 BTRFS_UUID_SIZE);
2975
2833 if (mid <= slot) { 2976 if (mid <= slot) {
2834 if (nritems == 1 || 2977 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2978 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2979 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2980 if (slot >= nritems) {
2981 struct btrfs_disk_key disk_key;
2982
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2983 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2984 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2985 wret = insert_ptr(trans, root, path,
@@ -2862,6 +3007,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 3007 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 3008 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 3009 if (!extend && data_size && slot == 0) {
3010 struct btrfs_disk_key disk_key;
3011
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3012 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 3013 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 3014 wret = insert_ptr(trans, root, path,
@@ -2894,76 +3041,16 @@ again:
2894 } 3041 }
2895 } 3042 }
2896 } 3043 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900 3044
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3045 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928
2929 if (right->map_token) {
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933
2934 btrfs_set_header_nritems(l, mid);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2947 BUG_ON(ret); 3046 BUG_ON(ret);
2948 3047
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 3048 if (double_split) {
2963 BUG_ON(num_doubles != 0); 3049 BUG_ON(num_doubles != 0);
2964 num_doubles++; 3050 num_doubles++;
2965 goto again; 3051 goto again;
2966 } 3052 }
3053
2967 return ret; 3054 return ret;
2968} 3055}
2969 3056
@@ -3021,26 +3108,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3108 return -EAGAIN;
3022 } 3109 }
3023 3110
3111 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3112 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3113 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3114 path->keep_locks = 0;
3027 BUG_ON(ret); 3115 BUG_ON(ret);
3028 3116
3117 btrfs_unlock_up_safe(path, 1);
3118 leaf = path->nodes[0];
3119 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3120
3121split:
3029 /* 3122 /*
3030 * make sure any changes to the path from split_leaf leave it 3123 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3124 * in a blocking state
3032 */ 3125 */
3033 btrfs_set_path_blocking(path); 3126 btrfs_set_path_blocking(path);
3034 3127
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3128 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3129 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3130 item_size = btrfs_item_size(leaf, item);
3042 3131
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3132 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3133 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3134 path->slots[0]), item_size);
@@ -3445,39 +3533,27 @@ out:
3445} 3533}
3446 3534
3447/* 3535/*
3448 * Given a key and some data, insert items into the tree. 3536 * this is a helper for btrfs_insert_empty_items, the main goal here is
3449 * This does all the path init required, making room in the tree if needed. 3537 * to save stack depth by doing the bulk of the work in a function
3538 * that doesn't call btrfs_search_slot
3450 */ 3539 */
3451int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3540static noinline_for_stack int
3452 struct btrfs_root *root, 3541setup_items_for_insert(struct btrfs_trans_handle *trans,
3453 struct btrfs_path *path, 3542 struct btrfs_root *root, struct btrfs_path *path,
3454 struct btrfs_key *cpu_key, u32 *data_size, 3543 struct btrfs_key *cpu_key, u32 *data_size,
3455 int nr) 3544 u32 total_data, u32 total_size, int nr)
3456{ 3545{
3457 struct extent_buffer *leaf;
3458 struct btrfs_item *item; 3546 struct btrfs_item *item;
3459 int ret = 0;
3460 int slot;
3461 int slot_orig;
3462 int i; 3547 int i;
3463 u32 nritems; 3548 u32 nritems;
3464 u32 total_size = 0;
3465 u32 total_data = 0;
3466 unsigned int data_end; 3549 unsigned int data_end;
3467 struct btrfs_disk_key disk_key; 3550 struct btrfs_disk_key disk_key;
3551 int ret;
3552 struct extent_buffer *leaf;
3553 int slot;
3468 3554
3469 for (i = 0; i < nr; i++)
3470 total_data += data_size[i];
3471
3472 total_size = total_data + (nr * sizeof(struct btrfs_item));
3473 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3474 if (ret == 0)
3475 return -EEXIST;
3476 if (ret < 0)
3477 goto out;
3478
3479 slot_orig = path->slots[0];
3480 leaf = path->nodes[0]; 3555 leaf = path->nodes[0];
3556 slot = path->slots[0];
3481 3557
3482 nritems = btrfs_header_nritems(leaf); 3558 nritems = btrfs_header_nritems(leaf);
3483 data_end = leaf_data_end(root, leaf); 3559 data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3565,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3489 BUG(); 3565 BUG();
3490 } 3566 }
3491 3567
3492 slot = path->slots[0];
3493 BUG_ON(slot < 0);
3494
3495 if (slot != nritems) { 3568 if (slot != nritems) {
3496 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3569 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497 3570
@@ -3547,21 +3620,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3547 data_end -= data_size[i]; 3620 data_end -= data_size[i];
3548 btrfs_set_item_size(leaf, item, data_size[i]); 3621 btrfs_set_item_size(leaf, item, data_size[i]);
3549 } 3622 }
3623
3550 btrfs_set_header_nritems(leaf, nritems + nr); 3624 btrfs_set_header_nritems(leaf, nritems + nr);
3551 btrfs_mark_buffer_dirty(leaf);
3552 3625
3553 ret = 0; 3626 ret = 0;
3554 if (slot == 0) { 3627 if (slot == 0) {
3628 struct btrfs_disk_key disk_key;
3555 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3629 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3630 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557 } 3631 }
3632 btrfs_unlock_up_safe(path, 1);
3633 btrfs_mark_buffer_dirty(leaf);
3558 3634
3559 if (btrfs_leaf_free_space(root, leaf) < 0) { 3635 if (btrfs_leaf_free_space(root, leaf) < 0) {
3560 btrfs_print_leaf(root, leaf); 3636 btrfs_print_leaf(root, leaf);
3561 BUG(); 3637 BUG();
3562 } 3638 }
3639 return ret;
3640}
3641
3642/*
3643 * Given a key and some data, insert items into the tree.
3644 * This does all the path init required, making room in the tree if needed.
3645 */
3646int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3647 struct btrfs_root *root,
3648 struct btrfs_path *path,
3649 struct btrfs_key *cpu_key, u32 *data_size,
3650 int nr)
3651{
3652 struct extent_buffer *leaf;
3653 int ret = 0;
3654 int slot;
3655 int i;
3656 u32 total_size = 0;
3657 u32 total_data = 0;
3658
3659 for (i = 0; i < nr; i++)
3660 total_data += data_size[i];
3661
3662 total_size = total_data + (nr * sizeof(struct btrfs_item));
3663 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3664 if (ret == 0)
3665 return -EEXIST;
3666 if (ret < 0)
3667 goto out;
3668
3669 leaf = path->nodes[0];
3670 slot = path->slots[0];
3671 BUG_ON(slot < 0);
3672
3673 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3674 total_data, total_size, nr);
3675
3563out: 3676out:
3564 btrfs_unlock_up_safe(path, 1);
3565 return ret; 3677 return ret;
3566} 3678}
3567 3679
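After this split, btrfs_insert_empty_items is a thin wrapper: it sums the item sizes, makes the one btrfs_search_slot call, and lets the noinline helper do the leaf surgery on a shallower stack. For reference, a caller sketch, roughly what btrfs_insert_item does via the single-item btrfs_insert_empty_item wrapper (trans, root, key, data and data_size are assumed to come from the caller):

	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_items(trans, root, path, &key, &data_size, 1);
	if (!ret) {
		/* path now points at the reserved, uninitialized item */
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;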
@@ -3749,7 +3861,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3749 } 3861 }
3750 3862
3751 /* delete the leaf if it is mostly empty */ 3863 /* delete the leaf if it is mostly empty */
3752 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { 3864 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3865 !trans->transaction->delayed_refs.flushing) {
3753 /* push_leaf_left fixes the path. 3866 /* push_leaf_left fixes the path.
3754 * make sure the path still points to our leaf 3867 * make sure the path still points to our leaf
3755 * for possible call to del_ptr below 3868 * for possible call to del_ptr below
@@ -3757,6 +3870,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3757 slot = path->slots[1]; 3870 slot = path->slots[1];
3758 extent_buffer_get(leaf); 3871 extent_buffer_get(leaf);
3759 3872
3873 btrfs_set_path_blocking(path);
3760 wret = push_leaf_left(trans, root, path, 1, 1); 3874 wret = push_leaf_left(trans, root, path, 1, 1);
3761 if (wret < 0 && wret != -ENOSPC) 3875 if (wret < 0 && wret != -ENOSPC)
3762 ret = wret; 3876 ret = wret;
@@ -4042,28 +4156,44 @@ next:
4042int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4156int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4043{ 4157{
4044 int slot; 4158 int slot;
4045 int level = 1; 4159 int level;
4046 struct extent_buffer *c; 4160 struct extent_buffer *c;
4047 struct extent_buffer *next = NULL; 4161 struct extent_buffer *next;
4048 struct btrfs_key key; 4162 struct btrfs_key key;
4049 u32 nritems; 4163 u32 nritems;
4050 int ret; 4164 int ret;
4165 int old_spinning = path->leave_spinning;
4166 int force_blocking = 0;
4051 4167
4052 nritems = btrfs_header_nritems(path->nodes[0]); 4168 nritems = btrfs_header_nritems(path->nodes[0]);
4053 if (nritems == 0) 4169 if (nritems == 0)
4054 return 1; 4170 return 1;
4055 4171
4056 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4172 /*
4173 * we take the blocks in an order that upsets lockdep. Using
4174 * blocking mode is the only way around it.
4175 */
4176#ifdef CONFIG_DEBUG_LOCK_ALLOC
4177 force_blocking = 1;
4178#endif
4057 4179
4180 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4181again:
4182 level = 1;
4183 next = NULL;
4058 btrfs_release_path(root, path); 4184 btrfs_release_path(root, path);
4185
4059 path->keep_locks = 1; 4186 path->keep_locks = 1;
4187
4188 if (!force_blocking)
4189 path->leave_spinning = 1;
4190
4060 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4191 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4061 path->keep_locks = 0; 4192 path->keep_locks = 0;
4062 4193
4063 if (ret < 0) 4194 if (ret < 0)
4064 return ret; 4195 return ret;
4065 4196
4066 btrfs_set_path_blocking(path);
4067 nritems = btrfs_header_nritems(path->nodes[0]); 4197 nritems = btrfs_header_nritems(path->nodes[0]);
4068 /* 4198 /*
4069 * by releasing the path above we dropped all our locks. A balance 4199 * by releasing the path above we dropped all our locks. A balance
@@ -4073,19 +4203,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4073 */ 4203 */
4074 if (nritems > 0 && path->slots[0] < nritems - 1) { 4204 if (nritems > 0 && path->slots[0] < nritems - 1) {
4075 path->slots[0]++; 4205 path->slots[0]++;
4206 ret = 0;
4076 goto done; 4207 goto done;
4077 } 4208 }
4078 4209
4079 while (level < BTRFS_MAX_LEVEL) { 4210 while (level < BTRFS_MAX_LEVEL) {
4080 if (!path->nodes[level]) 4211 if (!path->nodes[level]) {
4081 return 1; 4212 ret = 1;
4213 goto done;
4214 }
4082 4215
4083 slot = path->slots[level] + 1; 4216 slot = path->slots[level] + 1;
4084 c = path->nodes[level]; 4217 c = path->nodes[level];
4085 if (slot >= btrfs_header_nritems(c)) { 4218 if (slot >= btrfs_header_nritems(c)) {
4086 level++; 4219 level++;
4087 if (level == BTRFS_MAX_LEVEL) 4220 if (level == BTRFS_MAX_LEVEL) {
4088 return 1; 4221 ret = 1;
4222 goto done;
4223 }
4089 continue; 4224 continue;
4090 } 4225 }
4091 4226
@@ -4094,16 +4229,27 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4094 free_extent_buffer(next); 4229 free_extent_buffer(next);
4095 } 4230 }
4096 4231
4097 /* the path was set to blocking above */ 4232 next = c;
4098 if (level == 1 && (path->locks[1] || path->skip_locking) && 4233 ret = read_block_for_search(NULL, root, path, &next, level,
4099 path->reada) 4234 slot, &key);
4100 reada_for_search(root, path, level, slot, 0); 4235 if (ret == -EAGAIN)
4236 goto again;
4237
4238 if (ret < 0) {
4239 btrfs_release_path(root, path);
4240 goto done;
4241 }
4101 4242
4102 next = read_node_slot(root, c, slot);
4103 if (!path->skip_locking) { 4243 if (!path->skip_locking) {
4104 btrfs_assert_tree_locked(c); 4244 ret = btrfs_try_spin_lock(next);
4105 btrfs_tree_lock(next); 4245 if (!ret) {
4106 btrfs_set_lock_blocking(next); 4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4107 } 4253 }
4108 break; 4254 break;
4109 } 4255 }
@@ -4113,27 +4259,47 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4113 c = path->nodes[level]; 4259 c = path->nodes[level];
4114 if (path->locks[level]) 4260 if (path->locks[level])
4115 btrfs_tree_unlock(c); 4261 btrfs_tree_unlock(c);
4262
4116 free_extent_buffer(c); 4263 free_extent_buffer(c);
4117 path->nodes[level] = next; 4264 path->nodes[level] = next;
4118 path->slots[level] = 0; 4265 path->slots[level] = 0;
4119 if (!path->skip_locking) 4266 if (!path->skip_locking)
4120 path->locks[level] = 1; 4267 path->locks[level] = 1;
4268
4121 if (!level) 4269 if (!level)
4122 break; 4270 break;
4123 4271
4124 btrfs_set_path_blocking(path); 4272 ret = read_block_for_search(NULL, root, path, &next, level,
4125 if (level == 1 && path->locks[1] && path->reada) 4273 0, &key);
4126 reada_for_search(root, path, level, slot, 0); 4274 if (ret == -EAGAIN)
4127 next = read_node_slot(root, next, 0); 4275 goto again;
4276
4277 if (ret < 0) {
4278 btrfs_release_path(root, path);
4279 goto done;
4280 }
4281
4128 if (!path->skip_locking) { 4282 if (!path->skip_locking) {
4129 btrfs_assert_tree_locked(path->nodes[level]); 4283 btrfs_assert_tree_locked(path->nodes[level]);
4130 btrfs_tree_lock(next); 4284 ret = btrfs_try_spin_lock(next);
4131 btrfs_set_lock_blocking(next); 4285 if (!ret) {
4286 btrfs_set_path_blocking(path);
4287 btrfs_tree_lock(next);
4288 if (!force_blocking)
4289 btrfs_clear_path_blocking(path, next);
4290 }
4291 if (force_blocking)
4292 btrfs_set_lock_blocking(next);
4132 } 4293 }
4133 } 4294 }
4295 ret = 0;
4134done: 4296done:
4135 unlock_up(path, 0, 1); 4297 unlock_up(path, 0, 1);
4136 return 0; 4298 path->leave_spinning = old_spinning;
4299 if (!old_spinning)
4300 btrfs_set_path_blocking(path);
4301
4302 return ret;
4137} 4303}
4138 4304
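Two behavioral changes for callers here: the done: path now propagates errors from read_block_for_search instead of returning 0 unconditionally, and the -EAGAIN restart is handled internally by the again: loop. A sketch of the canonical iteration pattern this function supports (root, path, key, leaf and found_key assumed declared by the caller):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;	/* error */
			if (ret > 0)
				break;		/* walked past the last leaf */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}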
4139/* 4305/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d8..4414a5d9983a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
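The 8MB cutoff bounds how much data any single file on the ordered-operations list can force into a commit. A hypothetical illustration of the intended check (the real call site is elsewhere in this patch, in the ordered-operations code; inode here is assumed):

	/* pre-flush big files so the commit doesn't have to */
	if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		filemap_flush(inode->i_mapping);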
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
136#define BTRFS_FT_MAX 9 143#define BTRFS_FT_MAX 9
137 144
138/* 145/*
139 * the key defines the order in the tree, and so it also defines (optimal) 146 * The key defines the order in the tree, and so it also defines (optimal)
140 * block layout. objectid corresonds to the inode number. The flags 147 * block layout.
141 * tells us things about the object, and is a kind of stream selector. 148 *
142 * so for a given inode, keys with flags of 1 might refer to the inode 149 * objectid corresponds to the inode number.
143 * data, flags of 2 may point to file data in the btree and flags == 3 150 *
144 * may point to extents. 151 * type tells us things about the object, and is a kind of stream selector.
152 * so for a given inode, keys with type of 1 might refer to the inode data,
153 * type of 2 may point to file data in the btree and type == 3 may point to
154 * extents.
145 * 155 *
146 * offset is the starting byte offset for this key in the stream. 156 * offset is the starting byte offset for this key in the stream.
147 * 157 *
@@ -193,7 +203,7 @@ struct btrfs_dev_item {
193 203
194 /* 204 /*
195 * starting byte of this partition on the device, 205 * starting byte of this partition on the device,
196 * to allowr for stripe alignment in the future 206 * to allow for stripe alignment in the future
197 */ 207 */
198 __le64 start_offset; 208 __le64 start_offset;
199 209
@@ -401,15 +411,16 @@ struct btrfs_path {
401 int locks[BTRFS_MAX_LEVEL]; 411 int locks[BTRFS_MAX_LEVEL];
402 int reada; 412 int reada;
403 /* keep some upper locks as we walk down */ 413 /* keep some upper locks as we walk down */
404 int keep_locks;
405 int skip_locking;
406 int lowest_level; 414 int lowest_level;
407 415
408 /* 416 /*
409 * set by btrfs_split_item, tells search_slot to keep all locks 417 * set by btrfs_split_item, tells search_slot to keep all locks
410 * and to force calls to keep space in the nodes 418 * and to force calls to keep space in the nodes
411 */ 419 */
412 int search_for_split; 420 unsigned int search_for_split:1;
421 unsigned int keep_locks:1;
422 unsigned int skip_locking:1;
423 unsigned int leave_spinning:1;
413}; 424};
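search_for_split, keep_locks and skip_locking shrink to single bits, and the new leave_spinning bit asks btrfs_search_slot to return with the cheaper spinning locks still held rather than converting them to blocking locks. The usage pattern, as in the dir-item.c hunk later in this diff:

	path = btrfs_alloc_path();
	path->leave_spinning = 1;	/* keep spinning locks across the search */
	ret = btrfs_search_slot(trans, root, &key, path, ins_len, 1);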
414 425
415/* 426/*
@@ -625,18 +636,35 @@ struct btrfs_space_info {
625 struct rw_semaphore groups_sem; 636 struct rw_semaphore groups_sem;
626}; 637};
627 638
628struct btrfs_free_space { 639/*
629 struct rb_node bytes_index; 640 * free clusters are used to claim free space in relatively large chunks,
630 struct rb_node offset_index; 641 * allowing us to do less seeky writes. They are used for all metadata
631 u64 offset; 642 * allocations and data allocations in ssd mode.
632 u64 bytes; 643 */
644struct btrfs_free_cluster {
645 spinlock_t lock;
646 spinlock_t refill_lock;
647 struct rb_root root;
648
649 /* largest extent in this cluster */
650 u64 max_size;
651
652 /* first extent starting offset */
653 u64 window_start;
654
655 struct btrfs_block_group_cache *block_group;
656 /*
657 * when a cluster is allocated from a block group, we put the
658 * cluster onto a list in the block group so that it can
659 * be freed before the block group is freed.
660 */
661 struct list_head block_group_list;
633}; 662};
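A cluster is refilled under refill_lock and then carved up by allocations. A sketch of the expected calling pattern, assuming the btrfs_find_space_cluster and btrfs_alloc_from_cluster helpers this patch adds in free-space-cache.c (trans, root, block_group and the size variables come from the caller):

	spin_lock(&cluster->refill_lock);
	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
				       search_start, num_bytes, empty_size);
	if (!ret) {
		/* offset == 0 means the cluster could not satisfy the request */
		offset = btrfs_alloc_from_cluster(block_group, cluster,
						  num_bytes, search_start);
	}
	spin_unlock(&cluster->refill_lock);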
634 663
635struct btrfs_block_group_cache { 664struct btrfs_block_group_cache {
636 struct btrfs_key key; 665 struct btrfs_key key;
637 struct btrfs_block_group_item item; 666 struct btrfs_block_group_item item;
638 spinlock_t lock; 667 spinlock_t lock;
639 struct mutex alloc_mutex;
640 struct mutex cache_mutex; 668 struct mutex cache_mutex;
641 u64 pinned; 669 u64 pinned;
642 u64 reserved; 670 u64 reserved;
@@ -648,6 +676,7 @@ struct btrfs_block_group_cache {
648 struct btrfs_space_info *space_info; 676 struct btrfs_space_info *space_info;
649 677
650 /* free space cache stuff */ 678 /* free space cache stuff */
679 spinlock_t tree_lock;
651 struct rb_root free_space_bytes; 680 struct rb_root free_space_bytes;
652 struct rb_root free_space_offset; 681 struct rb_root free_space_offset;
653 682
@@ -659,6 +688,11 @@ struct btrfs_block_group_cache {
659 688
660 /* usage count */ 689 /* usage count */
661 atomic_t count; 690 atomic_t count;
691
692 /* List of struct btrfs_free_clusters for this block group.
693 * Today it will only have one thing on it, but that may change
694 */
695 struct list_head cluster_list;
662}; 696};
663 697
664struct btrfs_leaf_ref_tree { 698struct btrfs_leaf_ref_tree {
@@ -688,15 +722,18 @@ struct btrfs_fs_info {
688 struct rb_root block_group_cache_tree; 722 struct rb_root block_group_cache_tree;
689 723
690 struct extent_io_tree pinned_extents; 724 struct extent_io_tree pinned_extents;
691 struct extent_io_tree pending_del;
692 struct extent_io_tree extent_ins;
693 725
694 /* logical->physical extent mapping */ 726 /* logical->physical extent mapping */
695 struct btrfs_mapping_tree mapping_tree; 727 struct btrfs_mapping_tree mapping_tree;
696 728
697 u64 generation; 729 u64 generation;
698 u64 last_trans_committed; 730 u64 last_trans_committed;
699 u64 last_trans_new_blockgroup; 731
732 /*
733 * this is updated to the current trans every time a full commit
734 * is required instead of the faster short fsync log commits
735 */
736 u64 last_trans_log_full_commit;
700 u64 open_ioctl_trans; 737 u64 open_ioctl_trans;
701 unsigned long mount_opt; 738 unsigned long mount_opt;
702 u64 max_extent; 739 u64 max_extent;
@@ -717,12 +754,20 @@ struct btrfs_fs_info {
717 struct mutex tree_log_mutex; 754 struct mutex tree_log_mutex;
718 struct mutex transaction_kthread_mutex; 755 struct mutex transaction_kthread_mutex;
719 struct mutex cleaner_mutex; 756 struct mutex cleaner_mutex;
720 struct mutex extent_ins_mutex;
721 struct mutex pinned_mutex;
722 struct mutex chunk_mutex; 757 struct mutex chunk_mutex;
723 struct mutex drop_mutex; 758 struct mutex drop_mutex;
724 struct mutex volume_mutex; 759 struct mutex volume_mutex;
725 struct mutex tree_reloc_mutex; 760 struct mutex tree_reloc_mutex;
761
762 /*
763 * this protects the ordered operations list only while we are
764 * processing all of the entries on it. This way we make
765 * sure the commit code doesn't find the list temporarily empty
766 * because another function happens to be doing non-waiting preflush
767 * before jumping into the main commit.
768 */
769 struct mutex ordered_operations_mutex;
770
726 struct list_head trans_list; 771 struct list_head trans_list;
727 struct list_head hashers; 772 struct list_head hashers;
728 struct list_head dead_roots; 773 struct list_head dead_roots;
@@ -737,10 +782,29 @@ struct btrfs_fs_info {
737 * ordered extents 782 * ordered extents
738 */ 783 */
739 spinlock_t ordered_extent_lock; 784 spinlock_t ordered_extent_lock;
785
786 /*
787 * all of the data=ordered extents pending writeback
788 * these can span multiple transactions and basically include
789 * every dirty data page that isn't from nodatacow
790 */
740 struct list_head ordered_extents; 791 struct list_head ordered_extents;
792
793 /*
794 * all of the inodes that have delalloc bytes. It is possible for
795 * this list to be empty even when there is still dirty data=ordered
796 * extents waiting to finish IO.
797 */
741 struct list_head delalloc_inodes; 798 struct list_head delalloc_inodes;
742 799
743 /* 800 /*
801 * special rename and truncate targets that must be on disk before
802 * we're allowed to commit. This is basically the ext3 style
803 * data=ordered list.
804 */
805 struct list_head ordered_operations;
806
807 /*
744 * there is a pool of worker threads for checksumming during writes 808 * there is a pool of worker threads for checksumming during writes
745 * and a pool for checksumming after reads. This is because readers 809 * and a pool for checksumming after reads. This is because readers
746 * can run with FS locks held, and the writers may be waiting for 810 * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +845,11 @@ struct btrfs_fs_info {
781 atomic_t throttle_gen; 845 atomic_t throttle_gen;
782 846
783 u64 total_pinned; 847 u64 total_pinned;
848
849 /* protected by the delalloc lock, used to keep from writing
850 * metadata until there is a nice batch
851 */
852 u64 dirty_metadata_bytes;
784 struct list_head dirty_cowonly_roots; 853 struct list_head dirty_cowonly_roots;
785 854
786 struct btrfs_fs_devices *fs_devices; 855 struct btrfs_fs_devices *fs_devices;
@@ -795,8 +864,12 @@ struct btrfs_fs_info {
795 spinlock_t delalloc_lock; 864 spinlock_t delalloc_lock;
796 spinlock_t new_trans_lock; 865 spinlock_t new_trans_lock;
797 u64 delalloc_bytes; 866 u64 delalloc_bytes;
798 u64 last_alloc; 867
799 u64 last_data_alloc; 868 /* data_alloc_cluster is only used in ssd mode */
869 struct btrfs_free_cluster data_alloc_cluster;
870
871 /* all metadata allocations go through this cluster */
872 struct btrfs_free_cluster meta_alloc_cluster;
800 873
801 spinlock_t ref_cache_lock; 874 spinlock_t ref_cache_lock;
802 u64 total_ref_cache_size; 875 u64 total_ref_cache_size;
@@ -808,6 +881,9 @@ struct btrfs_fs_info {
808 u64 metadata_alloc_profile; 881 u64 metadata_alloc_profile;
809 u64 system_alloc_profile; 882 u64 system_alloc_profile;
810 883
884 unsigned data_chunk_allocations;
885 unsigned metadata_ratio;
886
811 void *bdev_holder; 887 void *bdev_holder;
812}; 888};
813 889
@@ -888,7 +964,6 @@ struct btrfs_root {
888}; 964};
889 965
890/* 966/*
891
892 * inode items have the data typically returned from stat and store other 967 * inode items have the data typically returned from stat and store other
893 * info about object characteristics. There is one for every file and dir in 968 * info about object characteristics. There is one for every file and dir in
894 * the FS 969 * the FS
@@ -919,7 +994,7 @@ struct btrfs_root {
919#define BTRFS_EXTENT_CSUM_KEY 128 994#define BTRFS_EXTENT_CSUM_KEY 128
920 995
921/* 996/*
922 * root items point to tree roots. There are typically in the root 997 * root items point to tree roots. They are typically in the root
923 * tree used by the super block to find all the other trees 998 * tree used by the super block to find all the other trees
924 */ 999 */
925#define BTRFS_ROOT_ITEM_KEY 132 1000#define BTRFS_ROOT_ITEM_KEY 132
@@ -966,6 +1041,8 @@ struct btrfs_root {
966#define BTRFS_MOUNT_SSD (1 << 3) 1041#define BTRFS_MOUNT_SSD (1 << 3)
967#define BTRFS_MOUNT_DEGRADED (1 << 4) 1042#define BTRFS_MOUNT_DEGRADED (1 << 4)
968#define BTRFS_MOUNT_COMPRESS (1 << 5) 1043#define BTRFS_MOUNT_COMPRESS (1 << 5)
1044#define BTRFS_MOUNT_NOTREELOG (1 << 6)
1045#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
969 1046
970#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1047#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
971#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1048#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1704,18 +1781,16 @@ static inline struct dentry *fdentry(struct file *file)
1704} 1781}
1705 1782
1706/* extent-tree.c */ 1783/* extent-tree.c */
1784void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1785int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1786 struct btrfs_root *root, unsigned long count);
1707int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1787int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1708int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1709 struct btrfs_root *root, u64 bytenr,
1710 u64 num_bytes, u32 *refs);
1711int btrfs_update_pinned_extents(struct btrfs_root *root, 1788int btrfs_update_pinned_extents(struct btrfs_root *root,
1712 u64 bytenr, u64 num, int pin); 1789 u64 bytenr, u64 num, int pin);
1713int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1790int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1714 struct btrfs_root *root, struct extent_buffer *leaf); 1791 struct btrfs_root *root, struct extent_buffer *leaf);
1715int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1792int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root, u64 objectid, u64 bytenr); 1793 struct btrfs_root *root, u64 objectid, u64 bytenr);
1717int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root);
1719int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1794int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1720struct btrfs_block_group_cache *btrfs_lookup_block_group( 1795struct btrfs_block_group_cache *btrfs_lookup_block_group(
1721 struct btrfs_fs_info *info, 1796 struct btrfs_fs_info *info,
@@ -1777,7 +1852,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1777 u64 root_objectid, u64 ref_generation, 1852 u64 root_objectid, u64 ref_generation,
1778 u64 owner_objectid); 1853 u64 owner_objectid);
1779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1854int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1780 struct btrfs_root *root, u64 bytenr, 1855 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1781 u64 orig_parent, u64 parent, 1856 u64 orig_parent, u64 parent,
1782 u64 root_objectid, u64 ref_generation, 1857 u64 root_objectid, u64 ref_generation,
1783 u64 owner_objectid); 1858 u64 owner_objectid);
@@ -1838,7 +1913,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1838int btrfs_cow_block(struct btrfs_trans_handle *trans, 1913int btrfs_cow_block(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, struct extent_buffer *buf, 1914 struct btrfs_root *root, struct extent_buffer *buf,
1840 struct extent_buffer *parent, int parent_slot, 1915 struct extent_buffer *parent, int parent_slot,
1841 struct extent_buffer **cow_ret, u64 prealloc_dest); 1916 struct extent_buffer **cow_ret);
1842int btrfs_copy_root(struct btrfs_trans_handle *trans, 1917int btrfs_copy_root(struct btrfs_trans_handle *trans,
1843 struct btrfs_root *root, 1918 struct btrfs_root *root,
1844 struct extent_buffer *buf, 1919 struct extent_buffer *buf,
@@ -2060,7 +2135,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2060unsigned long btrfs_force_ra(struct address_space *mapping, 2135unsigned long btrfs_force_ra(struct address_space *mapping,
2061 struct file_ra_state *ra, struct file *file, 2136 struct file_ra_state *ra, struct file *file,
2062 pgoff_t offset, pgoff_t last_index); 2137 pgoff_t offset, pgoff_t last_index);
2063int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2138int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2064int btrfs_readpage(struct file *file, struct page *page); 2139int btrfs_readpage(struct file *file, struct page *page);
2065void btrfs_delete_inode(struct inode *inode); 2140void btrfs_delete_inode(struct inode *inode);
2066void btrfs_put_inode(struct inode *inode); 2141void btrfs_put_inode(struct inode *inode);
@@ -2102,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2102extern struct file_operations btrfs_file_operations; 2177extern struct file_operations btrfs_file_operations;
2103int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2178int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root, struct inode *inode, 2179 struct btrfs_root *root, struct inode *inode,
2105 u64 start, u64 end, u64 inline_limit, u64 *hint_block); 2180 u64 start, u64 end, u64 locked_end,
2181 u64 inline_limit, u64 *hint_block);
2106int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2182int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2107 struct btrfs_root *root, 2183 struct btrfs_root *root,
2108 struct inode *inode, u64 start, u64 end); 2184 struct inode *inode, u64 start, u64 end);
@@ -2133,21 +2209,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
2133int btrfs_init_acl(struct inode *inode, struct inode *dir); 2209int btrfs_init_acl(struct inode *inode, struct inode *dir);
2134int btrfs_acl_chmod(struct inode *inode); 2210int btrfs_acl_chmod(struct inode *inode);
2135 2211
2136/* free-space-cache.c */
2137int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2138 u64 bytenr, u64 size);
2139int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2140 u64 offset, u64 bytes);
2141int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2142 u64 bytenr, u64 size);
2143int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2144 u64 offset, u64 bytes);
2145void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2146 *block_group);
2147struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2148 *block_group, u64 offset,
2149 u64 bytes);
2150void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2151 u64 bytes);
2152u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2153#endif 2212#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include "ctree.h"
22#include "delayed-ref.h"
23#include "transaction.h"
24
25/*
26 * delayed back reference update tracking. For subvolume trees
27 * we queue up extent allocations and backref maintenance for
28 * delayed processing. This avoids deep call chains where we
29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */
37
38/*
39 * entries in the rb tree are ordered by the byte number of the extent
40 * and by the byte number of the parent block.
41 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref,
43 u64 bytenr, u64 parent)
44{
45 if (bytenr < ref->bytenr)
46 return -1;
47 if (bytenr > ref->bytenr)
48 return 1;
49 if (parent < ref->parent)
50 return -1;
51 if (parent > ref->parent)
52 return 1;
53 return 0;
54}
55
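comp_entry compares the probe (bytenr, parent) pair against an existing entry, bytenr first. A quick trace with hypothetical values, entry = {.bytenr = 100, .parent = 5}:

	comp_entry(entry, 100, 1)  -> -1   /* probe sorts before entry */
	comp_entry(entry, 100, 9)  ->  1   /* probe sorts after entry  */
	comp_entry(entry, 100, 5)  ->  0   /* duplicate (bytenr,parent) */

Because the head ref uses parent == (u64)-1, it sorts after every regular ref for the same extent, which is why tree_search with parent (u64)-1 lands on the head when one exists.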
56/*
57 * insert a new ref into the rbtree. This returns any existing refs
58 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
59 * inserted.
60 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node)
64{
65 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry;
68 int cmp;
69
70 while (*p) {
71 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node);
74
75 cmp = comp_entry(entry, bytenr, parent);
76 if (cmp < 0)
77 p = &(*p)->rb_left;
78 else if (cmp > 0)
79 p = &(*p)->rb_right;
80 else
81 return entry;
82 }
83
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root);
87 return NULL;
88}
89
90/*
91 * find an entry based on (bytenr,parent). This returns the delayed
92 * ref if it was able to find one, or NULL if nothing was in that spot
93 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
95 u64 bytenr, u64 parent,
96 struct btrfs_delayed_ref_node **last)
97{
98 struct rb_node *n = root->rb_node;
99 struct btrfs_delayed_ref_node *entry;
100 int cmp;
101
102 while (n) {
103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
104 WARN_ON(!entry->in_tree);
105 if (last)
106 *last = entry;
107
108 cmp = comp_entry(entry, bytenr, parent);
109 if (cmp < 0)
110 n = n->rb_left;
111 else if (cmp > 0)
112 n = n->rb_right;
113 else
114 return entry;
115 }
116 return NULL;
117}
118
119int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
120 struct btrfs_delayed_ref_head *head)
121{
122 struct btrfs_delayed_ref_root *delayed_refs;
123
124 delayed_refs = &trans->transaction->delayed_refs;
125 assert_spin_locked(&delayed_refs->lock);
126 if (mutex_trylock(&head->mutex))
127 return 0;
128
129 atomic_inc(&head->node.refs);
130 spin_unlock(&delayed_refs->lock);
131
132 mutex_lock(&head->mutex);
133 spin_lock(&delayed_refs->lock);
134 if (!head->node.in_tree) {
135 mutex_unlock(&head->mutex);
136 btrfs_put_delayed_ref(&head->node);
137 return -EAGAIN;
138 }
139 btrfs_put_delayed_ref(&head->node);
140 return 0;
141}
142
143int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
144 struct list_head *cluster, u64 start)
145{
146 int count = 0;
147 struct btrfs_delayed_ref_root *delayed_refs;
148 struct rb_node *node;
149 struct btrfs_delayed_ref_node *ref;
150 struct btrfs_delayed_ref_head *head;
151
152 delayed_refs = &trans->transaction->delayed_refs;
153 if (start == 0) {
154 node = rb_first(&delayed_refs->root);
155 } else {
156 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
158 if (ref) {
159 struct btrfs_delayed_ref_node *tmp;
160
161 node = rb_prev(&ref->rb_node);
162 while (node) {
163 tmp = rb_entry(node,
164 struct btrfs_delayed_ref_node,
165 rb_node);
166 if (tmp->bytenr < start)
167 break;
168 ref = tmp;
169 node = rb_prev(&ref->rb_node);
170 }
171 node = &ref->rb_node;
172 } else
173 node = rb_first(&delayed_refs->root);
174 }
175again:
176 while (node && count < 32) {
177 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
178 if (btrfs_delayed_ref_is_head(ref)) {
179 head = btrfs_delayed_node_to_head(ref);
180 if (list_empty(&head->cluster)) {
181 list_add_tail(&head->cluster, cluster);
182 delayed_refs->run_delayed_start =
183 head->node.bytenr;
184 count++;
185
186 WARN_ON(delayed_refs->num_heads_ready == 0);
187 delayed_refs->num_heads_ready--;
188 } else if (count) {
189 /* the goal of the clustering is to find extents
190 * that are likely to end up in the same extent
191 * leaf on disk. So, we don't want them spread
192 * all over the tree. Stop now if we've hit
193 * a head that was already in use
194 */
195 break;
196 }
197 }
198 node = rb_next(node);
199 }
200 if (count) {
201 return 0;
202 } else if (start) {
203 /*
204 * we've gone to the end of the rbtree without finding any
205 * clusters. start from the beginning and try again
206 */
207 start = 0;
208 node = rb_first(&delayed_refs->root);
209 goto again;
210 }
211 return 1;
212}
213
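The consumer loops pulling clusters of at most 32 heads until nothing ready remains; run_delayed_start lets successive calls continue where the last cluster ended, wrapping around once. A sketch of that loop, shaped like the btrfs_run_delayed_refs caller in extent-tree.c (actually running the refs is elided):

	struct list_head cluster;
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	int ret;

	INIT_LIST_HEAD(&cluster);
	spin_lock(&delayed_refs->lock);
	while (1) {
		ret = btrfs_find_ref_cluster(trans, &cluster,
					     delayed_refs->run_delayed_start);
		if (ret)
			break;	/* no more heads ready to run */
		/*
		 * run and unlock every head on the cluster; the real code
		 * drops delayed_refs->lock while doing so
		 */
	}
	spin_unlock(&delayed_refs->lock);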
214/*
215 * This checks to see if there are any delayed refs in the
216 * btree for a given bytenr. It returns one if it finds any
217 * and zero otherwise.
218 *
219 * If it only finds a head node, it returns 0.
220 *
221 * The idea is to use this when deciding if you can safely delete an
222 * extent from the extent allocation tree. There may be a pending
223 * ref in the rbtree that adds or removes references, so as long as this
224 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
225 * allocation tree.
226 */
227int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
228{
229 struct btrfs_delayed_ref_node *ref;
230 struct btrfs_delayed_ref_root *delayed_refs;
231 struct rb_node *prev_node;
232 int ret = 0;
233
234 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock);
236
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
238 if (ref) {
239 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node)
241 goto out;
242 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
243 rb_node);
244 if (ref->bytenr == bytenr)
245 ret = 1;
246 }
247out:
248 spin_unlock(&delayed_refs->lock);
249 return ret;
250}
251
252/*
253 * helper function to lookup reference count
254 *
255 * the head node for delayed ref is used to store the sum of all the
256 * reference count modifications queued up in the rbtree. This way you
257 * can check to see what the reference count would be if all of the
258 * delayed refs are processed.
259 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs)
263{
264 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei;
270 struct btrfs_key key;
271 u32 num_refs;
272 int ret;
273
274 path = btrfs_alloc_path();
275 if (!path)
276 return -ENOMEM;
277
278 key.objectid = bytenr;
279 key.type = BTRFS_EXTENT_ITEM_KEY;
280 key.offset = num_bytes;
281 delayed_refs = &trans->transaction->delayed_refs;
282again:
283 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
284 &key, path, 0, 0);
285 if (ret < 0)
286 goto out;
287
288 if (ret == 0) {
289 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0],
291 struct btrfs_extent_item);
292 num_refs = btrfs_extent_refs(leaf, ei);
293 } else {
294 num_refs = 0;
295 ret = 0;
296 }
297
298 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
300 if (ref) {
301 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod;
304 mutex_unlock(&head->mutex);
305 *refs = num_refs;
306 goto out;
307 }
308
309 atomic_inc(&ref->refs);
310 spin_unlock(&delayed_refs->lock);
311
312 btrfs_release_path(root->fs_info->extent_root, path);
313
314 mutex_lock(&head->mutex);
315 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 }
321out:
322 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path);
324 return ret;
325}
326
327/*
328 * helper function to update an extent delayed ref in the
329 * rbtree. existing and update must both have the same
330 * bytenr and parent
331 *
332 * This may free existing if the update cancels out whatever
333 * operation it was doing.
334 */
335static noinline void
336update_existing_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_delayed_ref_root *delayed_refs,
338 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update)
340{
341 struct btrfs_delayed_ref *existing_ref;
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /*
352 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes
354 * down to zero we just delete the entry without
 355 * ever changing the extent allocation tree.
356 */
357 existing->ref_mod--;
358 if (existing->ref_mod == 0) {
359 rb_erase(&existing->rb_node,
360 &delayed_refs->root);
361 existing->in_tree = 0;
362 btrfs_put_delayed_ref(existing);
363 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--;
366 }
367 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
369 /* if we're adding refs, make sure all the
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /*
381 * the action on the existing ref matches
382 * the action on the ref we're trying to add.
383 * Bump the ref_mod by one so the backref that
384 * is eventually added/removed has the correct
385 * reference count
386 */
387 existing->ref_mod += update->ref_mod;
388 }
389}
390
391/*
392 * helper function to update the accounting in the head ref
393 * existing and update must have the same bytenr
394 */
395static noinline void
396update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
397 struct btrfs_delayed_ref_node *update)
398{
399 struct btrfs_delayed_ref_head *existing_ref;
400 struct btrfs_delayed_ref_head *ref;
401
402 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update);
404
405 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then
407 * reallocated before the delayed ref
408 * entries were processed, we can end up
409 * with an existing head ref without
410 * the must_insert_reserved flag set.
411 * Set it again here
412 */
413 existing_ref->must_insert_reserved = ref->must_insert_reserved;
414
415 /*
416 * update the num_bytes so we make sure the accounting
417 * is done correctly
418 */
419 existing->num_bytes = update->num_bytes;
420
421 }
422
423 /*
424 * update the reference mod on the head to reflect this new operation
425 */
426 existing->ref_mod += update->ref_mod;
427}
428
429/*
430 * helper function to actually insert a delayed ref into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing
433 * with updating existing nodes as new modifications are queued.
434 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
438 u64 ref_generation, u64 owner_objectid, int action,
439 int pin)
440{
441 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1;
446 int must_insert_reserved = 0;
447
448 /*
449 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one.
451 */
452 if (parent == (u64)-1) {
453 if (action == BTRFS_DROP_DELAYED_REF)
454 count_mod = -1;
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
456 count_mod = 0;
457 }
458
459 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
461 * the reserved accounting when the extent is finally added, or
462 * if a later modification deletes the delayed ref without ever
463 * inserting the extent into the extent allocation tree.
464 * ref->must_insert_reserved is the flag used to record
465 * that accounting mods are required.
466 *
467 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) {
471 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF;
473 } else {
474 must_insert_reserved = 0;
475 }
476
477
478 delayed_refs = &trans->transaction->delayed_refs;
479
480 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr;
483 ref->parent = parent;
484 ref->ref_mod = count_mod;
485 ref->in_tree = 1;
486 ref->num_bytes = num_bytes;
487
488 if (btrfs_delayed_ref_is_head(ref)) {
489 head_ref = btrfs_delayed_node_to_head(ref);
490 head_ref->must_insert_reserved = must_insert_reserved;
491 INIT_LIST_HEAD(&head_ref->cluster);
492 mutex_init(&head_ref->mutex);
493 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root;
496 full_ref->generation = ref_generation;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 }
501
502 existing = tree_insert(&delayed_refs->root, bytenr,
503 parent, &ref->rb_node);
504
505 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref))
507 update_existing_head_ref(existing, ref);
508 else
509 update_existing_ref(trans, delayed_refs, existing, ref);
510
511 /*
512 * we've updated the existing ref, free the newly
513 * allocated ref
514 */
515 kfree(ref);
516 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++;
523 }
524 return 0;
525}
526
527/*
528 * add a delayed ref to the tree. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this
530 * transaction commits.
531 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
534 u64 ref_generation, u64 owner_objectid, int action,
535 int pin)
536{
537 struct btrfs_delayed_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret;
541
542 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref)
544 return -ENOMEM;
545
546 /*
547 * the parent = 0 case comes from cases where we don't actually
 548 * know the parent yet. It will get updated later via an add/drop
549 * pair.
550 */
551 if (parent == 0)
552 parent = bytenr;
553
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) {
556 kfree(ref);
557 return -ENOMEM;
558 }
559 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock);
561
562 /*
563 * insert both the head node and the new ref without dropping
564 * the spin lock
565 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin);
568 BUG_ON(ret);
569
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation,
572 owner_objectid, action, pin);
573 BUG_ON(ret);
574 spin_unlock(&delayed_refs->lock);
575 return 0;
576}
577
578/*
579 * this does a simple search for the head node for a given extent.
580 * It must be called with the delayed ref spinlock held, and it returns
 581 * the head node if one was found, or NULL if not.
582 */
583struct btrfs_delayed_ref_head *
584btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
585{
586 struct btrfs_delayed_ref_node *ref;
587 struct btrfs_delayed_ref_root *delayed_refs;
588
589 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
591 if (ref)
592 return btrfs_delayed_node_to_head(ref);
593 return NULL;
594}
595
596/*
597 * add a delayed ref to the tree. This does all of the accounting required
598 * to make sure the delayed ref is eventually processed before this
599 * transaction commits.
600 *
601 * The main point of this call is to add and remove a backreference in a single
602 * shot, taking the lock only once, and only searching for the head node once.
603 *
604 * It is the same as doing a ref add and delete in two separate calls.
605 */
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root,
609 u64 orig_ref_generation, u64 ref_generation,
610 u64 owner_objectid, int pin)
611{
612 struct btrfs_delayed_ref *ref;
613 struct btrfs_delayed_ref *old_ref;
614 struct btrfs_delayed_ref_head *head_ref;
615 struct btrfs_delayed_ref_root *delayed_refs;
616 int ret;
617
618 ref = kmalloc(sizeof(*ref), GFP_NOFS);
619 if (!ref)
620 return -ENOMEM;
621
622 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
623 if (!old_ref) {
624 kfree(ref);
625 return -ENOMEM;
626 }
627
628 /*
629 * the parent = 0 case comes from cases where we don't actually
 630 * know the parent yet. It will get updated later via an add/drop
631 * pair.
632 */
633 if (parent == 0)
634 parent = bytenr;
635 if (orig_parent == 0)
636 orig_parent = bytenr;
637
638 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
639 if (!head_ref) {
640 kfree(ref);
641 kfree(old_ref);
642 return -ENOMEM;
643 }
644 delayed_refs = &trans->transaction->delayed_refs;
645 spin_lock(&delayed_refs->lock);
646
647 /*
648 * insert both the head node and the new ref without dropping
649 * the spin lock
650 */
651 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
652 (u64)-1, 0, 0, 0,
653 BTRFS_UPDATE_DELAYED_HEAD, 0);
654 BUG_ON(ret);
655
656 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
657 parent, ref_root, ref_generation,
658 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
659 BUG_ON(ret);
660
661 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
662 orig_parent, orig_ref_root,
663 orig_ref_generation, owner_objectid,
664 BTRFS_DROP_DELAYED_REF, pin);
665 BUG_ON(ret);
666 spin_unlock(&delayed_refs->lock);
667 return 0;
668}
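To make the ref_mod bookkeeping above concrete, a worked trace for one extent at bytenr B (values hypothetical):

	add ref, parent P1   ->  head.ref_mod = +1, ref(P1).ref_mod = 1
	add ref, parent P2   ->  head.ref_mod = +2, ref(P2).ref_mod = 1
	drop ref, parent P1  ->  head.ref_mod = +1, ref(P1) erased
	                         (opposite actions cancel in update_existing_ref)

btrfs_lookup_extent_ref then reports the on-disk reference count plus head.ref_mod, i.e. the count as if every queued modification had already been applied.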
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
 43 * how many refs this entry is adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
 75 * when a new extent is allocated, it is just reserved in memory.
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
 83 * we need to update the in-ram accounting to properly reflect
84 * the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
 167 * a node might live in a head or a regular ref; this lets you
168 * test for the proper type to use.
169 */
 170static inline int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
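The cast helpers enforce the head/regular split at runtime. A dispatch sketch for code walking the rbtree (delayed_refs assumed to come from the current transaction):

	struct rb_node *n;
	struct btrfs_delayed_ref_node *node;

	n = rb_first(&delayed_refs->root);
	if (n) {
		node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(node)) {
			struct btrfs_delayed_ref_head *head;

			head = btrfs_delayed_node_to_head(node);
			/* head->mutex serializes running this extent's refs */
		} else {
			struct btrfs_delayed_ref *ref;

			ref = btrfs_delayed_node_to_ref(node);
			/* ref->action, ref->root, ref->generation describe the mod */
		}
	}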
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc869..4b0ea0b80c23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
38#include "locking.h" 38#include "locking.h"
39#include "ref-cache.h" 39#include "ref-cache.h"
40#include "tree-log.h" 40#include "tree-log.h"
41#include "free-space-cache.h"
41 42
42static struct extent_io_ops btree_extent_io_ops; 43static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
@@ -231,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
231 memcpy(&found, result, csum_size); 232 memcpy(&found, result, csum_size);
232 233
233 read_extent_buffer(buf, &val, 0, csum_size); 234 read_extent_buffer(buf, &val, 0, csum_size);
234 printk(KERN_INFO "btrfs: %s checksum verify failed " 235 if (printk_ratelimit()) {
235 "on %llu wanted %X found %X level %d\n", 236 printk(KERN_INFO "btrfs: %s checksum verify "
236 root->fs_info->sb->s_id, 237 "failed on %llu wanted %X found %X "
237 buf->start, val, found, btrfs_header_level(buf)); 238 "level %d\n",
239 root->fs_info->sb->s_id,
240 (unsigned long long)buf->start, val, found,
241 btrfs_header_level(buf));
242 }
238 if (result != (char *)&inline_result) 243 if (result != (char *)&inline_result)
239 kfree(result); 244 kfree(result);
240 return 1; 245 return 1;
@@ -267,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
267 ret = 0; 272 ret = 0;
268 goto out; 273 goto out;
269 } 274 }
270 printk("parent transid verify failed on %llu wanted %llu found %llu\n", 275 if (printk_ratelimit()) {
271 (unsigned long long)eb->start, 276 printk("parent transid verify failed on %llu wanted %llu "
272 (unsigned long long)parent_transid, 277 "found %llu\n",
273 (unsigned long long)btrfs_header_generation(eb)); 278 (unsigned long long)eb->start,
279 (unsigned long long)parent_transid,
280 (unsigned long long)btrfs_header_generation(eb));
281 }
274 ret = 1; 282 ret = 1;
275 clear_extent_buffer_uptodate(io_tree, eb); 283 clear_extent_buffer_uptodate(io_tree, eb);
276out: 284out:
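The same conversion repeats through the rest of this file: corruption messages that can fire once per bad block now go through printk_ratelimit(), which by default allows roughly a burst of 10 messages per 5 seconds, and u64 values gain (unsigned long long) casts so %llu stays portable on architectures where u64 is unsigned long. The idiom, with bytenr as a stand-in u64:

	if (printk_ratelimit())
		printk(KERN_INFO "btrfs: bad block %llu\n",
		       (unsigned long long)bytenr);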
@@ -414,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
414 422
415 found_start = btrfs_header_bytenr(eb); 423 found_start = btrfs_header_bytenr(eb);
416 if (found_start != start) { 424 if (found_start != start) {
417 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", 425 if (printk_ratelimit()) {
418 (unsigned long long)found_start, 426 printk(KERN_INFO "btrfs bad tree block start "
419 (unsigned long long)eb->start); 427 "%llu %llu\n",
428 (unsigned long long)found_start,
429 (unsigned long long)eb->start);
430 }
420 ret = -EIO; 431 ret = -EIO;
421 goto err; 432 goto err;
422 } 433 }
@@ -428,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
428 goto err; 439 goto err;
429 } 440 }
430 if (check_tree_block_fsid(root, eb)) { 441 if (check_tree_block_fsid(root, eb)) {
431 printk(KERN_INFO "btrfs bad fsid on block %llu\n", 442 if (printk_ratelimit()) {
432 (unsigned long long)eb->start); 443 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
444 (unsigned long long)eb->start);
445 }
433 ret = -EIO; 446 ret = -EIO;
434 goto err; 447 goto err;
435 } 448 }
@@ -578,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
578 async->bio_flags = bio_flags; 591 async->bio_flags = bio_flags;
579 592
580 atomic_inc(&fs_info->nr_async_submits); 593 atomic_inc(&fs_info->nr_async_submits);
594
595 if (rw & (1 << BIO_RW_SYNCIO))
596 btrfs_set_work_high_prio(&async->work);
597
581 btrfs_queue_worker(&fs_info->workers, &async->work); 598 btrfs_queue_worker(&fs_info->workers, &async->work);
582#if 0
583 int limit = btrfs_async_submit_limit(fs_info);
584 if (atomic_read(&fs_info->nr_async_submits) > limit) {
585 wait_event_timeout(fs_info->async_submit_wait,
586 (atomic_read(&fs_info->nr_async_submits) < limit),
587 HZ/10);
588 599
589 wait_event_timeout(fs_info->async_submit_wait,
590 (atomic_read(&fs_info->nr_async_bios) < limit),
591 HZ/10);
592 }
593#endif
594 while (atomic_read(&fs_info->async_submit_draining) && 600 while (atomic_read(&fs_info->async_submit_draining) &&
595 atomic_read(&fs_info->nr_async_submits)) { 601 atomic_read(&fs_info->nr_async_submits)) {
596 wait_event(fs_info->async_submit_wait, 602 wait_event(fs_info->async_submit_wait,
@@ -655,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
655 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 661 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
656 mirror_num, 0); 662 mirror_num, 0);
657 } 663 }
664
658 /* 665 /*
659 * kthread helpers are used to submit writes so that checksumming 666 * kthread helpers are used to submit writes so that checksumming
660 * can happen in parallel across all CPUs 667 * can happen in parallel across all CPUs
@@ -668,14 +675,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
668static int btree_writepage(struct page *page, struct writeback_control *wbc) 675static int btree_writepage(struct page *page, struct writeback_control *wbc)
669{ 676{
670 struct extent_io_tree *tree; 677 struct extent_io_tree *tree;
678 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
679 struct extent_buffer *eb;
680 int was_dirty;
681
671 tree = &BTRFS_I(page->mapping->host)->io_tree; 682 tree = &BTRFS_I(page->mapping->host)->io_tree;
683 if (!(current->flags & PF_MEMALLOC)) {
684 return extent_write_full_page(tree, page,
685 btree_get_extent, wbc);
686 }
672 687
673 if (current->flags & PF_MEMALLOC) { 688 redirty_page_for_writepage(wbc, page);
674 redirty_page_for_writepage(wbc, page); 689 eb = btrfs_find_tree_block(root, page_offset(page),
675 unlock_page(page); 690 PAGE_CACHE_SIZE);
676 return 0; 691 WARN_ON(!eb);
692
693 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
694 if (!was_dirty) {
695 spin_lock(&root->fs_info->delalloc_lock);
696 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
697 spin_unlock(&root->fs_info->delalloc_lock);
677 } 698 }
678 return extent_write_full_page(tree, page, btree_get_extent, wbc); 699 free_extent_buffer(eb);
700
701 unlock_page(page);
702 return 0;
679} 703}
680 704
681static int btree_writepages(struct address_space *mapping, 705static int btree_writepages(struct address_space *mapping,
@@ -684,15 +708,15 @@ static int btree_writepages(struct address_space *mapping,
684 struct extent_io_tree *tree; 708 struct extent_io_tree *tree;
685 tree = &BTRFS_I(mapping->host)->io_tree; 709 tree = &BTRFS_I(mapping->host)->io_tree;
686 if (wbc->sync_mode == WB_SYNC_NONE) { 710 if (wbc->sync_mode == WB_SYNC_NONE) {
711 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
687 u64 num_dirty; 712 u64 num_dirty;
688 u64 start = 0;
689 unsigned long thresh = 32 * 1024 * 1024; 713 unsigned long thresh = 32 * 1024 * 1024;
690 714
691 if (wbc->for_kupdate) 715 if (wbc->for_kupdate)
692 return 0; 716 return 0;
693 717
694 num_dirty = count_range_bits(tree, &start, (u64)-1, 718 /* this is a bit racy, but that's ok */
695 thresh, EXTENT_DIRTY); 719 num_dirty = root->fs_info->dirty_metadata_bytes;
696 if (num_dirty < thresh) 720 if (num_dirty < thresh)
697 return 0; 721 return 0;
698 } 722 }
@@ -747,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
747 } 771 }
748} 772}
749 773
750#if 0
751static int btree_writepage(struct page *page, struct writeback_control *wbc)
752{
753 struct buffer_head *bh;
754 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
755 struct buffer_head *head;
756 if (!page_has_buffers(page)) {
757 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
758 (1 << BH_Dirty)|(1 << BH_Uptodate));
759 }
760 head = page_buffers(page);
761 bh = head;
762 do {
763 if (buffer_dirty(bh))
764 csum_tree_block(root, bh, 0);
765 bh = bh->b_this_page;
766 } while (bh != head);
767 return block_write_full_page(page, btree_get_block, wbc);
768}
769#endif
770
771static struct address_space_operations btree_aops = { 774static struct address_space_operations btree_aops = {
772 .readpage = btree_readpage, 775 .readpage = btree_readpage,
773 .writepage = btree_writepage, 776 .writepage = btree_writepage,
@@ -845,8 +848,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
845 848
846 if (ret == 0) 849 if (ret == 0)
847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); 850 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
848 else
849 WARN_ON(1);
850 return buf; 851 return buf;
851 852
852} 853}
@@ -859,9 +860,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
859 root->fs_info->running_transaction->transid) { 860 root->fs_info->running_transaction->transid) {
860 btrfs_assert_tree_locked(buf); 861 btrfs_assert_tree_locked(buf);
861 862
862 /* ugh, clear_extent_buffer_dirty can be expensive */ 863 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
863 btrfs_set_lock_blocking(buf); 864 spin_lock(&root->fs_info->delalloc_lock);
865 if (root->fs_info->dirty_metadata_bytes >= buf->len)
866 root->fs_info->dirty_metadata_bytes -= buf->len;
867 else
868 WARN_ON(1);
869 spin_unlock(&root->fs_info->delalloc_lock);
870 }
864 871
872 /* ugh, clear_extent_buffer_dirty needs to lock the page */
873 btrfs_set_lock_blocking(buf);
865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 874 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
866 buf); 875 buf);
867 } 876 }
@@ -1247,11 +1256,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1247 int ret = 0; 1256 int ret = 0;
1248 struct btrfs_device *device; 1257 struct btrfs_device *device;
1249 struct backing_dev_info *bdi; 1258 struct backing_dev_info *bdi;
1250#if 0 1259
1251 if ((bdi_bits & (1 << BDI_write_congested)) &&
1252 btrfs_congested_async(info, 0))
1253 return 1;
1254#endif
1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) { 1260 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1256 if (!device->bdev) 1261 if (!device->bdev)
1257 continue; 1262 continue;
@@ -1387,8 +1392,6 @@ static int bio_ready_for_csum(struct bio *bio)
1387 1392
1388 ret = extent_range_uptodate(io_tree, start + length, 1393 ret = extent_range_uptodate(io_tree, start + length,
1389 start + buf_len - 1); 1394 start + buf_len - 1);
1390 if (ret == 1)
1391 return ret;
1392 return ret; 1395 return ret;
1393} 1396}
1394 1397
@@ -1471,12 +1474,6 @@ static int transaction_kthread(void *arg)
1471 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1474 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1472 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1475 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1473 1476
1474 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1475 printk(KERN_INFO "btrfs: total reference cache "
1476 "size %llu\n",
1477 root->fs_info->total_ref_cache_size);
1478 }
1479
1480 mutex_lock(&root->fs_info->trans_mutex); 1477 mutex_lock(&root->fs_info->trans_mutex);
1481 cur = root->fs_info->running_transaction; 1478 cur = root->fs_info->running_transaction;
1482 if (!cur) { 1479 if (!cur) {
@@ -1493,6 +1490,7 @@ static int transaction_kthread(void *arg)
1493 mutex_unlock(&root->fs_info->trans_mutex); 1490 mutex_unlock(&root->fs_info->trans_mutex);
1494 trans = btrfs_start_transaction(root, 1); 1491 trans = btrfs_start_transaction(root, 1);
1495 ret = btrfs_commit_transaction(trans, root); 1492 ret = btrfs_commit_transaction(trans, root);
1493
1496sleep: 1494sleep:
1497 wake_up_process(root->fs_info->cleaner_kthread); 1495 wake_up_process(root->fs_info->cleaner_kthread);
1498 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1496 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1550,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1552 INIT_LIST_HEAD(&fs_info->dead_roots); 1550 INIT_LIST_HEAD(&fs_info->dead_roots);
1553 INIT_LIST_HEAD(&fs_info->hashers); 1551 INIT_LIST_HEAD(&fs_info->hashers);
1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1552 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1553 INIT_LIST_HEAD(&fs_info->ordered_operations);
1555 spin_lock_init(&fs_info->delalloc_lock); 1554 spin_lock_init(&fs_info->delalloc_lock);
1556 spin_lock_init(&fs_info->new_trans_lock); 1555 spin_lock_init(&fs_info->new_trans_lock);
1557 spin_lock_init(&fs_info->ref_cache_lock); 1556 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1579,6 +1578,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1579 fs_info->btree_inode = new_inode(sb); 1578 fs_info->btree_inode = new_inode(sb);
1580 fs_info->btree_inode->i_ino = 1; 1579 fs_info->btree_inode->i_ino = 1;
1581 fs_info->btree_inode->i_nlink = 1; 1580 fs_info->btree_inode->i_nlink = 1;
1581 fs_info->metadata_ratio = 8;
1582 1582
1583 fs_info->thread_pool_size = min_t(unsigned long, 1583 fs_info->thread_pool_size = min_t(unsigned long,
1584 num_online_cpus() + 2, 8); 1584 num_online_cpus() + 2, 8);
@@ -1611,10 +1611,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1611 1611
1612 extent_io_tree_init(&fs_info->pinned_extents, 1612 extent_io_tree_init(&fs_info->pinned_extents,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1613 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 extent_io_tree_init(&fs_info->pending_del,
1615 fs_info->btree_inode->i_mapping, GFP_NOFS);
1616 extent_io_tree_init(&fs_info->extent_ins,
1617 fs_info->btree_inode->i_mapping, GFP_NOFS);
1618 fs_info->do_barriers = 1; 1614 fs_info->do_barriers = 1;
1619 1615
1620 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1616 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,15 +1623,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 insert_inode_hash(fs_info->btree_inode); 1623 insert_inode_hash(fs_info->btree_inode);
1628 1624
1629 mutex_init(&fs_info->trans_mutex); 1625 mutex_init(&fs_info->trans_mutex);
1626 mutex_init(&fs_info->ordered_operations_mutex);
1630 mutex_init(&fs_info->tree_log_mutex); 1627 mutex_init(&fs_info->tree_log_mutex);
1631 mutex_init(&fs_info->drop_mutex); 1628 mutex_init(&fs_info->drop_mutex);
1632 mutex_init(&fs_info->extent_ins_mutex);
1633 mutex_init(&fs_info->pinned_mutex);
1634 mutex_init(&fs_info->chunk_mutex); 1629 mutex_init(&fs_info->chunk_mutex);
1635 mutex_init(&fs_info->transaction_kthread_mutex); 1630 mutex_init(&fs_info->transaction_kthread_mutex);
1636 mutex_init(&fs_info->cleaner_mutex); 1631 mutex_init(&fs_info->cleaner_mutex);
1637 mutex_init(&fs_info->volume_mutex); 1632 mutex_init(&fs_info->volume_mutex);
1638 mutex_init(&fs_info->tree_reloc_mutex); 1633 mutex_init(&fs_info->tree_reloc_mutex);
1634
1635 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1636 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1637
1639 init_waitqueue_head(&fs_info->transaction_throttle); 1638 init_waitqueue_head(&fs_info->transaction_throttle);
1640 init_waitqueue_head(&fs_info->transaction_wait); 1639 init_waitqueue_head(&fs_info->transaction_wait);
1641 init_waitqueue_head(&fs_info->async_submit_wait); 1640 init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1670,7 +1669,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1670 if (features) { 1669 if (features) {
1671 printk(KERN_ERR "BTRFS: couldn't mount because of " 1670 printk(KERN_ERR "BTRFS: couldn't mount because of "
1672 "unsupported optional features (%Lx).\n", 1671 "unsupported optional features (%Lx).\n",
1673 features); 1672 (unsigned long long)features);
1674 err = -EINVAL; 1673 err = -EINVAL;
1675 goto fail_iput; 1674 goto fail_iput;
1676 } 1675 }
@@ -1680,7 +1679,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 if (!(sb->s_flags & MS_RDONLY) && features) { 1679 if (!(sb->s_flags & MS_RDONLY) && features) {
1681 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " 1680 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1682 "unsupported option features (%Lx).\n", 1681 "unsupported option features (%Lx).\n",
1683 features); 1682 (unsigned long long)features);
1684 err = -EINVAL; 1683 err = -EINVAL;
1685 goto fail_iput; 1684 goto fail_iput;
1686 } 1685 }
@@ -2076,10 +2075,10 @@ static int write_dev_supers(struct btrfs_device *device,
2076 device->barriers = 0; 2075 device->barriers = 0;
2077 get_bh(bh); 2076 get_bh(bh);
2078 lock_buffer(bh); 2077 lock_buffer(bh);
2079 ret = submit_bh(WRITE, bh); 2078 ret = submit_bh(WRITE_SYNC, bh);
2080 } 2079 }
2081 } else { 2080 } else {
2082 ret = submit_bh(WRITE, bh); 2081 ret = submit_bh(WRITE_SYNC, bh);
2083 } 2082 }
2084 2083
2085 if (!ret && wait) { 2084 if (!ret && wait) {
@@ -2272,7 +2271,7 @@ int close_ctree(struct btrfs_root *root)
2272 2271
2273 if (fs_info->delalloc_bytes) { 2272 if (fs_info->delalloc_bytes) {
2274 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2273 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2275 fs_info->delalloc_bytes); 2274 (unsigned long long)fs_info->delalloc_bytes);
2276 } 2275 }
2277 if (fs_info->total_ref_cache_size) { 2276 if (fs_info->total_ref_cache_size) {
2278 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", 2277 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
@@ -2309,16 +2308,6 @@ int close_ctree(struct btrfs_root *root)
2309 btrfs_stop_workers(&fs_info->endio_write_workers); 2308 btrfs_stop_workers(&fs_info->endio_write_workers);
2310 btrfs_stop_workers(&fs_info->submit_workers); 2309 btrfs_stop_workers(&fs_info->submit_workers);
2311 2310
2312#if 0
2313 while (!list_empty(&fs_info->hashers)) {
2314 struct btrfs_hasher *hasher;
2315 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2316 hashers);
2317 list_del(&hasher->hashers);
2318 crypto_free_hash(&fs_info->hash_tfm);
2319 kfree(hasher);
2320 }
2321#endif
2322 btrfs_close_devices(fs_info->fs_devices); 2311 btrfs_close_devices(fs_info->fs_devices);
2323 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2312 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2324 2313
@@ -2358,8 +2347,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2358 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2347 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2359 u64 transid = btrfs_header_generation(buf); 2348 u64 transid = btrfs_header_generation(buf);
2360 struct inode *btree_inode = root->fs_info->btree_inode; 2349 struct inode *btree_inode = root->fs_info->btree_inode;
2361 2350 int was_dirty;
2362 btrfs_set_lock_blocking(buf);
2363 2351
2364 btrfs_assert_tree_locked(buf); 2352 btrfs_assert_tree_locked(buf);
2365 if (transid != root->fs_info->generation) { 2353 if (transid != root->fs_info->generation) {
@@ -2370,7 +2358,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2370 (unsigned long long)root->fs_info->generation); 2358 (unsigned long long)root->fs_info->generation);
2371 WARN_ON(1); 2359 WARN_ON(1);
2372 } 2360 }
2373 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2361 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2362 buf);
2363 if (!was_dirty) {
2364 spin_lock(&root->fs_info->delalloc_lock);
2365 root->fs_info->dirty_metadata_bytes += buf->len;
2366 spin_unlock(&root->fs_info->delalloc_lock);
2367 }
2374} 2368}
2375 2369
2376void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2370void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2404,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2410int btree_lock_page_hook(struct page *page) 2404int btree_lock_page_hook(struct page *page)
2411{ 2405{
2412 struct inode *inode = page->mapping->host; 2406 struct inode *inode = page->mapping->host;
2407 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2408 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2414 struct extent_buffer *eb; 2409 struct extent_buffer *eb;
2415 unsigned long len; 2410 unsigned long len;
@@ -2425,6 +2420,16 @@ int btree_lock_page_hook(struct page *page)
2425 2420
2426 btrfs_tree_lock(eb); 2421 btrfs_tree_lock(eb);
2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2422 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2423
2424 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2425 spin_lock(&root->fs_info->delalloc_lock);
2426 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2427 root->fs_info->dirty_metadata_bytes -= eb->len;
2428 else
2429 WARN_ON(1);
2430 spin_unlock(&root->fs_info->delalloc_lock);
2431 }
2432
2428 btrfs_tree_unlock(eb); 2433 btrfs_tree_unlock(eb);
2429 free_extent_buffer(eb); 2434 free_extent_buffer(eb);
2430out: 2435out:
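Note on the disk-io.c hunks above: they replace the count_range_bits() scan with a running dirty_metadata_bytes counter. The invariant is that only a clean-to-dirty transition (test_and_set_bit on EXTENT_BUFFER_DIRTY) adds to the counter and only a dirty-to-clean transition (test_and_clear_bit) subtracts, both under delalloc_lock, so a buffer is never counted twice. The following is a minimal userspace model of that invariant, not btrfs code; the names and primitives stand in for the kernel ones.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct buffer {
		atomic_bool dirty;	/* models the EXTENT_BUFFER_DIRTY bit */
		unsigned long len;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* delalloc_lock */
	static unsigned long dirty_metadata_bytes;

	static void mark_dirty(struct buffer *b)
	{
		/* only the clean->dirty transition may add to the counter */
		if (!atomic_exchange(&b->dirty, true)) {
			pthread_mutex_lock(&lock);
			dirty_metadata_bytes += b->len;
			pthread_mutex_unlock(&lock);
		}
	}

	static void mark_clean(struct buffer *b)
	{
		/* only the dirty->clean transition may subtract */
		if (atomic_exchange(&b->dirty, false)) {
			pthread_mutex_lock(&lock);
			if (dirty_metadata_bytes >= b->len)
				dirty_metadata_bytes -= b->len;
			pthread_mutex_unlock(&lock);
		}
	}

	int main(void)
	{
		struct buffer b = { .dirty = false, .len = 4096 };

		mark_dirty(&b);
		mark_dirty(&b);		/* second call must not double-count */
		printf("dirty: %lu\n", dirty_metadata_bytes);	/* 4096 */
		mark_clean(&b);
		printf("dirty: %lu\n", dirty_metadata_bytes);	/* 0 */
		return 0;
	}

This is also why btree_writepages can read dirty_metadata_bytes without the lock ("this is a bit racy, but that's ok"): the value is only a writeback heuristic, while the add/subtract paths keep it exact.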
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
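A note on the printk_ratelimit() guards added in the disk-io.c error paths above: they keep a corrupted filesystem from flooding the log with one message per bad block. A rough userspace sketch of the idea follows; ratelimit() and its one-message-per-second policy are invented for illustration and omit the kernel helper's burst allowance.

	#include <stdio.h>
	#include <time.h>

	static int ratelimit(void)
	{
		static time_t last;
		time_t now = time(NULL);

		if (now == last)	/* at most one message per second */
			return 0;
		last = now;
		return 1;
	}

	int main(void)
	{
		for (int i = 0; i < 1000000; i++) {
			if (ratelimit())
				fprintf(stderr, "checksum verify failed\n");
		}
		return 0;
	}

The error return (-EIO or ret = 1) is still taken on every failure; only the logging is throttled.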
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..35af93355063 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "volumes.h" 31#include "volumes.h"
32#include "locking.h" 32#include "locking.h"
33#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
34 35
35#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -49,17 +50,23 @@ struct pending_extent_op {
49 int del; 50 int del;
50}; 51};
51 52
52static int finish_current_insert(struct btrfs_trans_handle *trans, 53static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all); 54 struct btrfs_root *root, u64 parent,
54static int del_pending_extents(struct btrfs_trans_handle *trans, 55 u64 root_objectid, u64 ref_generation,
55 struct btrfs_root *extent_root, int all); 56 u64 owner, struct btrfs_key *ins,
56static int pin_down_bytes(struct btrfs_trans_handle *trans, 57 int ref_mod);
57 struct btrfs_root *root, 58static int update_reserved_extents(struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data); 59 u64 bytenr, u64 num, int reserve);
59static int update_block_group(struct btrfs_trans_handle *trans, 60static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 61 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc, 62 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 63 int mark_free);
64static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 u64 bytenr, u64 num_bytes, u64 parent,
67 u64 root_objectid, u64 ref_generation,
68 u64 owner_objectid, int pin,
69 int ref_to_drop);
63 70
64static int do_chunk_alloc(struct btrfs_trans_handle *trans, 71static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes, 72 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -160,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
160 u64 extent_start, extent_end, size; 167 u64 extent_start, extent_end, size;
161 int ret; 168 int ret;
162 169
163 mutex_lock(&info->pinned_mutex);
164 while (start < end) { 170 while (start < end) {
165 ret = find_first_extent_bit(&info->pinned_extents, start, 171 ret = find_first_extent_bit(&info->pinned_extents, start,
166 &extent_start, &extent_end, 172 &extent_start, &extent_end,
@@ -186,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
186 ret = btrfs_add_free_space(block_group, start, size); 192 ret = btrfs_add_free_space(block_group, start, size);
187 BUG_ON(ret); 193 BUG_ON(ret);
188 } 194 }
189 mutex_unlock(&info->pinned_mutex);
190 195
191 return 0; 196 return 0;
192} 197}
@@ -285,8 +290,8 @@ next:
285 block_group->key.objectid + 290 block_group->key.objectid +
286 block_group->key.offset); 291 block_group->key.offset);
287 292
288 remove_sb_from_cache(root, block_group);
289 block_group->cached = 1; 293 block_group->cached = 1;
294 remove_sb_from_cache(root, block_group);
290 ret = 0; 295 ret = 0;
291err: 296err:
292 btrfs_free_path(path); 297 btrfs_free_path(path);
@@ -307,7 +312,7 @@ btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
307} 312}
308 313
309/* 314/*
310 * return the block group that contains teh given bytenr 315 * return the block group that contains the given bytenr
311 */ 316 */
312struct btrfs_block_group_cache *btrfs_lookup_block_group( 317struct btrfs_block_group_cache *btrfs_lookup_block_group(
313 struct btrfs_fs_info *info, 318 struct btrfs_fs_info *info,
@@ -320,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
320 return cache; 325 return cache;
321} 326}
322 327
323static inline void put_block_group(struct btrfs_block_group_cache *cache) 328void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
324{ 329{
325 if (atomic_dec_and_test(&cache->count)) 330 if (atomic_dec_and_test(&cache->count))
326 kfree(cache); 331 kfree(cache);
@@ -393,12 +398,12 @@ again:
393 div_factor(cache->key.offset, factor)) { 398 div_factor(cache->key.offset, factor)) {
394 group_start = cache->key.objectid; 399 group_start = cache->key.objectid;
395 spin_unlock(&cache->lock); 400 spin_unlock(&cache->lock);
396 put_block_group(cache); 401 btrfs_put_block_group(cache);
397 goto found; 402 goto found;
398 } 403 }
399 } 404 }
400 spin_unlock(&cache->lock); 405 spin_unlock(&cache->lock);
401 put_block_group(cache); 406 btrfs_put_block_group(cache);
402 cond_resched(); 407 cond_resched();
403 } 408 }
404 if (!wrapped) { 409 if (!wrapped) {
@@ -554,262 +559,13 @@ out:
554 return ret; 559 return ret;
555} 560}
556 561
557/*
558 * updates all the backrefs that are pending on update_list for the
559 * extent_root
560 */
561static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 struct btrfs_root *extent_root,
563 struct btrfs_path *path,
564 struct list_head *update_list)
565{
566 struct btrfs_key key;
567 struct btrfs_extent_ref *ref;
568 struct btrfs_fs_info *info = extent_root->fs_info;
569 struct pending_extent_op *op;
570 struct extent_buffer *leaf;
571 int ret = 0;
572 struct list_head *cur = update_list->next;
573 u64 ref_objectid;
574 u64 ref_root = extent_root->root_key.objectid;
575
576 op = list_entry(cur, struct pending_extent_op, list);
577
578search:
579 key.objectid = op->bytenr;
580 key.type = BTRFS_EXTENT_REF_KEY;
581 key.offset = op->orig_parent;
582
583 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 BUG_ON(ret);
585
586 leaf = path->nodes[0];
587
588loop:
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590
591 ref_objectid = btrfs_ref_objectid(leaf, ref);
592
593 if (btrfs_ref_root(leaf, ref) != ref_root ||
594 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 (ref_objectid != op->level &&
596 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 "root %llu, owner %u\n",
599 (unsigned long long)op->bytenr,
600 (unsigned long long)op->orig_parent,
601 (unsigned long long)ref_root, op->level);
602 btrfs_print_leaf(extent_root, leaf);
603 BUG();
604 }
605
606 key.objectid = op->bytenr;
607 key.offset = op->parent;
608 key.type = BTRFS_EXTENT_REF_KEY;
609 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 BUG_ON(ret);
611 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 btrfs_set_ref_generation(leaf, ref, op->generation);
613
614 cur = cur->next;
615
616 list_del_init(&op->list);
617 unlock_extent(&info->extent_ins, op->bytenr,
618 op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 kfree(op);
620
621 if (cur == update_list) {
622 btrfs_mark_buffer_dirty(path->nodes[0]);
623 btrfs_release_path(extent_root, path);
624 goto out;
625 }
626
627 op = list_entry(cur, struct pending_extent_op, list);
628
629 path->slots[0]++;
630 while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 if (key.objectid == op->bytenr &&
633 key.type == BTRFS_EXTENT_REF_KEY)
634 goto loop;
635 path->slots[0]++;
636 }
637
638 btrfs_mark_buffer_dirty(path->nodes[0]);
639 btrfs_release_path(extent_root, path);
640 goto search;
641
642out:
643 return 0;
644}
645
646static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 struct btrfs_root *extent_root,
648 struct btrfs_path *path,
649 struct list_head *insert_list, int nr)
650{
651 struct btrfs_key *keys;
652 u32 *data_size;
653 struct pending_extent_op *op;
654 struct extent_buffer *leaf;
655 struct list_head *cur = insert_list->next;
656 struct btrfs_fs_info *info = extent_root->fs_info;
657 u64 ref_root = extent_root->root_key.objectid;
658 int i = 0, last = 0, ret;
659 int total = nr * 2;
660
661 if (!nr)
662 return 0;
663
664 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 if (!keys)
666 return -ENOMEM;
667
668 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 if (!data_size) {
670 kfree(keys);
671 return -ENOMEM;
672 }
673
674 list_for_each_entry(op, insert_list, list) {
675 keys[i].objectid = op->bytenr;
676 keys[i].offset = op->num_bytes;
677 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 data_size[i] = sizeof(struct btrfs_extent_item);
679 i++;
680
681 keys[i].objectid = op->bytenr;
682 keys[i].offset = op->parent;
683 keys[i].type = BTRFS_EXTENT_REF_KEY;
684 data_size[i] = sizeof(struct btrfs_extent_ref);
685 i++;
686 }
687
688 op = list_entry(cur, struct pending_extent_op, list);
689 i = 0;
690 while (i < total) {
691 int c;
692 ret = btrfs_insert_some_items(trans, extent_root, path,
693 keys+i, data_size+i, total-i);
694 BUG_ON(ret < 0);
695
696 if (last && ret > 1)
697 BUG();
698
699 leaf = path->nodes[0];
700 for (c = 0; c < ret; c++) {
701 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702
703 /*
704 * if the first item we inserted was a backref, then
705 * the EXTENT_ITEM will be the odd c's, else it will
706 * be the even c's
707 */
708 if ((ref_first && (c % 2)) ||
709 (!ref_first && !(c % 2))) {
710 struct btrfs_extent_item *itm;
711
712 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 struct btrfs_extent_item);
714 btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 op->del++;
716 } else {
717 struct btrfs_extent_ref *ref;
718
719 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 struct btrfs_extent_ref);
721 btrfs_set_ref_root(leaf, ref, ref_root);
722 btrfs_set_ref_generation(leaf, ref,
723 op->generation);
724 btrfs_set_ref_objectid(leaf, ref, op->level);
725 btrfs_set_ref_num_refs(leaf, ref, 1);
726 op->del++;
727 }
728
729 /*
730 * using del to see when its ok to free up the
731 * pending_extent_op. In the case where we insert the
732 * last item on the list in order to help do batching
733 * we need to not free the extent op until we actually
734 * insert the extent_item
735 */
736 if (op->del == 2) {
737 unlock_extent(&info->extent_ins, op->bytenr,
738 op->bytenr + op->num_bytes - 1,
739 GFP_NOFS);
740 cur = cur->next;
741 list_del_init(&op->list);
742 kfree(op);
743 if (cur != insert_list)
744 op = list_entry(cur,
745 struct pending_extent_op,
746 list);
747 }
748 }
749 btrfs_mark_buffer_dirty(leaf);
750 btrfs_release_path(extent_root, path);
751
752 /*
753 * Ok backref's and items usually go right next to eachother,
754 * but if we could only insert 1 item that means that we
755 * inserted on the end of a leaf, and we have no idea what may
756 * be on the next leaf so we just play it safe. In order to
757 * try and help this case we insert the last thing on our
758 * insert list so hopefully it will end up being the last
759 * thing on the leaf and everything else will be before it,
760 * which will let us insert a whole bunch of items at the same
761 * time.
762 */
763 if (ret == 1 && !last && (i + ret < total)) {
764 /*
765 * last: where we will pick up the next time around
766 * i: our current key to insert, will be total - 1
767 * cur: the current op we are screwing with
768 * op: duh
769 */
770 last = i + ret;
771 i = total - 1;
772 cur = insert_list->prev;
773 op = list_entry(cur, struct pending_extent_op, list);
774 } else if (last) {
775 /*
776 * ok we successfully inserted the last item on the
777 * list, lets reset everything
778 *
779 * i: our current key to insert, so where we left off
780 * last time
781 * last: done with this
782 * cur: the op we are messing with
783 * op: duh
784 * total: since we inserted the last key, we need to
785 * decrement total so we dont overflow
786 */
787 i = last;
788 last = 0;
789 total--;
790 if (i < total) {
791 cur = insert_list->next;
792 op = list_entry(cur, struct pending_extent_op,
793 list);
794 }
795 } else {
796 i += ret;
797 }
798
799 cond_resched();
800 }
801 ret = 0;
802 kfree(keys);
803 kfree(data_size);
804 return ret;
805}
806
807static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 562static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 struct btrfs_root *root, 563 struct btrfs_root *root,
809 struct btrfs_path *path, 564 struct btrfs_path *path,
810 u64 bytenr, u64 parent, 565 u64 bytenr, u64 parent,
811 u64 ref_root, u64 ref_generation, 566 u64 ref_root, u64 ref_generation,
812 u64 owner_objectid) 567 u64 owner_objectid,
568 int refs_to_add)
813{ 569{
814 struct btrfs_key key; 570 struct btrfs_key key;
815 struct extent_buffer *leaf; 571 struct extent_buffer *leaf;
@@ -829,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
829 btrfs_set_ref_root(leaf, ref, ref_root); 585 btrfs_set_ref_root(leaf, ref, ref_root);
830 btrfs_set_ref_generation(leaf, ref, ref_generation); 586 btrfs_set_ref_generation(leaf, ref, ref_generation);
831 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 587 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 btrfs_set_ref_num_refs(leaf, ref, 1); 588 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
833 } else if (ret == -EEXIST) { 589 } else if (ret == -EEXIST) {
834 u64 existing_owner; 590 u64 existing_owner;
591
835 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 592 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 leaf = path->nodes[0]; 593 leaf = path->nodes[0];
837 ref = btrfs_item_ptr(leaf, path->slots[0], 594 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
845 602
846 num_refs = btrfs_ref_num_refs(leaf, ref); 603 num_refs = btrfs_ref_num_refs(leaf, ref);
847 BUG_ON(num_refs == 0); 604 BUG_ON(num_refs == 0);
848 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 605 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
849 606
850 existing_owner = btrfs_ref_objectid(leaf, ref); 607 existing_owner = btrfs_ref_objectid(leaf, ref);
851 if (existing_owner != owner_objectid && 608 if (existing_owner != owner_objectid &&
@@ -857,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
857 } else { 614 } else {
858 goto out; 615 goto out;
859 } 616 }
617 btrfs_unlock_up_safe(path, 1);
860 btrfs_mark_buffer_dirty(path->nodes[0]); 618 btrfs_mark_buffer_dirty(path->nodes[0]);
861out: 619out:
862 btrfs_release_path(root, path); 620 btrfs_release_path(root, path);
@@ -865,7 +623,8 @@ out:
865 623
866static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 624static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 struct btrfs_root *root, 625 struct btrfs_root *root,
868 struct btrfs_path *path) 626 struct btrfs_path *path,
627 int refs_to_drop)
869{ 628{
870 struct extent_buffer *leaf; 629 struct extent_buffer *leaf;
871 struct btrfs_extent_ref *ref; 630 struct btrfs_extent_ref *ref;
@@ -875,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
875 leaf = path->nodes[0]; 634 leaf = path->nodes[0];
876 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 635 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 num_refs = btrfs_ref_num_refs(leaf, ref); 636 num_refs = btrfs_ref_num_refs(leaf, ref);
878 BUG_ON(num_refs == 0); 637 BUG_ON(num_refs < refs_to_drop);
879 num_refs -= 1; 638 num_refs -= refs_to_drop;
880 if (num_refs == 0) { 639 if (num_refs == 0) {
881 ret = btrfs_del_item(trans, root, path); 640 ret = btrfs_del_item(trans, root, path);
882 } else { 641 } else {
@@ -927,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
927#endif 686#endif
928} 687}
929 688
930static noinline int free_extents(struct btrfs_trans_handle *trans,
931 struct btrfs_root *extent_root,
932 struct list_head *del_list)
933{
934 struct btrfs_fs_info *info = extent_root->fs_info;
935 struct btrfs_path *path;
936 struct btrfs_key key, found_key;
937 struct extent_buffer *leaf;
938 struct list_head *cur;
939 struct pending_extent_op *op;
940 struct btrfs_extent_item *ei;
941 int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 u32 refs;
943 u64 bytes_freed = 0;
944
945 path = btrfs_alloc_path();
946 if (!path)
947 return -ENOMEM;
948 path->reada = 1;
949
950search:
951 /* search for the backref for the current ref we want to delete */
952 cur = del_list->next;
953 op = list_entry(cur, struct pending_extent_op, list);
954 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 op->orig_parent,
956 extent_root->root_key.objectid,
957 op->orig_generation, op->level, 1);
958 if (ret) {
959 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 "root %llu gen %llu owner %u\n",
961 (unsigned long long)op->bytenr,
962 (unsigned long long)extent_root->root_key.objectid,
963 (unsigned long long)op->orig_generation, op->level);
964 btrfs_print_leaf(extent_root, path->nodes[0]);
965 WARN_ON(1);
966 goto out;
967 }
968
969 extent_slot = path->slots[0];
970 num_to_del = 1;
971 found_extent = 0;
972
973 /*
974 * if we aren't the first item on the leaf we can move back one and see
975 * if our ref is right next to our extent item
976 */
977 if (likely(extent_slot)) {
978 extent_slot--;
979 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 extent_slot);
981 if (found_key.objectid == op->bytenr &&
982 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 found_key.offset == op->num_bytes) {
984 num_to_del++;
985 found_extent = 1;
986 }
987 }
988
989 /*
990 * if we didn't find the extent we need to delete the backref and then
991 * search for the extent item key so we can update its ref count
992 */
993 if (!found_extent) {
994 key.objectid = op->bytenr;
995 key.type = BTRFS_EXTENT_ITEM_KEY;
996 key.offset = op->num_bytes;
997
998 ret = remove_extent_backref(trans, extent_root, path);
999 BUG_ON(ret);
1000 btrfs_release_path(extent_root, path);
1001 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 BUG_ON(ret);
1003 extent_slot = path->slots[0];
1004 }
1005
1006 /* this is where we update the ref count for the extent */
1007 leaf = path->nodes[0];
1008 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 refs = btrfs_extent_refs(leaf, ei);
1010 BUG_ON(refs == 0);
1011 refs--;
1012 btrfs_set_extent_refs(leaf, ei, refs);
1013
1014 btrfs_mark_buffer_dirty(leaf);
1015
1016 /*
1017 * This extent needs deleting. The reason cur_slot is extent_slot +
1018 * num_to_del is because extent_slot points to the slot where the extent
1019 * is, and if the backref was not right next to the extent we will be
1020 * deleting at least 1 item, and will want to start searching at the
1021 * slot directly next to extent_slot. However if we did find the
1022 * backref next to the extent item them we will be deleting at least 2
1023 * items and will want to start searching directly after the ref slot
1024 */
1025 if (!refs) {
1026 struct list_head *pos, *n, *end;
1027 int cur_slot = extent_slot+num_to_del;
1028 u64 super_used;
1029 u64 root_used;
1030
1031 path->slots[0] = extent_slot;
1032 bytes_freed = op->num_bytes;
1033
1034 mutex_lock(&info->pinned_mutex);
1035 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 op->num_bytes, op->level >=
1037 BTRFS_FIRST_FREE_OBJECTID);
1038 mutex_unlock(&info->pinned_mutex);
1039 BUG_ON(ret < 0);
1040 op->del = ret;
1041
1042 /*
1043 * we need to see if we can delete multiple things at once, so
1044 * start looping through the list of extents we are wanting to
1045 * delete and see if their extent/backref's are right next to
1046 * eachother and the extents only have 1 ref
1047 */
1048 for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 struct pending_extent_op *tmp;
1050
1051 tmp = list_entry(pos, struct pending_extent_op, list);
1052
1053 /* we only want to delete extent+ref at this stage */
1054 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 break;
1056
1057 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 if (found_key.objectid != tmp->bytenr ||
1059 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 found_key.offset != tmp->num_bytes)
1061 break;
1062
1063 /* check to make sure this extent only has one ref */
1064 ei = btrfs_item_ptr(leaf, cur_slot,
1065 struct btrfs_extent_item);
1066 if (btrfs_extent_refs(leaf, ei) != 1)
1067 break;
1068
1069 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 if (found_key.objectid != tmp->bytenr ||
1071 found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 found_key.offset != tmp->orig_parent)
1073 break;
1074
1075 /*
1076 * the ref is right next to the extent, we can set the
1077 * ref count to 0 since we will delete them both now
1078 */
1079 btrfs_set_extent_refs(leaf, ei, 0);
1080
1081 /* pin down the bytes for this extent */
1082 mutex_lock(&info->pinned_mutex);
1083 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 tmp->num_bytes, tmp->level >=
1085 BTRFS_FIRST_FREE_OBJECTID);
1086 mutex_unlock(&info->pinned_mutex);
1087 BUG_ON(ret < 0);
1088
1089 /*
1090 * use the del field to tell if we need to go ahead and
1091 * free up the extent when we delete the item or not.
1092 */
1093 tmp->del = ret;
1094 bytes_freed += tmp->num_bytes;
1095
1096 num_to_del += 2;
1097 cur_slot += 2;
1098 }
1099 end = pos;
1100
1101 /* update the free space counters */
1102 spin_lock(&info->delalloc_lock);
1103 super_used = btrfs_super_bytes_used(&info->super_copy);
1104 btrfs_set_super_bytes_used(&info->super_copy,
1105 super_used - bytes_freed);
1106
1107 root_used = btrfs_root_used(&extent_root->root_item);
1108 btrfs_set_root_used(&extent_root->root_item,
1109 root_used - bytes_freed);
1110 spin_unlock(&info->delalloc_lock);
1111
1112 /* delete the items */
1113 ret = btrfs_del_items(trans, extent_root, path,
1114 path->slots[0], num_to_del);
1115 BUG_ON(ret);
1116
1117 /*
1118 * loop through the extents we deleted and do the cleanup work
1119 * on them
1120 */
1121 for (pos = cur, n = pos->next; pos != end;
1122 pos = n, n = pos->next) {
1123 struct pending_extent_op *tmp;
1124 tmp = list_entry(pos, struct pending_extent_op, list);
1125
1126 /*
1127 * remember tmp->del tells us wether or not we pinned
1128 * down the extent
1129 */
1130 ret = update_block_group(trans, extent_root,
1131 tmp->bytenr, tmp->num_bytes, 0,
1132 tmp->del);
1133 BUG_ON(ret);
1134
1135 list_del_init(&tmp->list);
1136 unlock_extent(&info->extent_ins, tmp->bytenr,
1137 tmp->bytenr + tmp->num_bytes - 1,
1138 GFP_NOFS);
1139 kfree(tmp);
1140 }
1141 } else if (refs && found_extent) {
1142 /*
1143 * the ref and extent were right next to eachother, but the
1144 * extent still has a ref, so just free the backref and keep
1145 * going
1146 */
1147 ret = remove_extent_backref(trans, extent_root, path);
1148 BUG_ON(ret);
1149
1150 list_del_init(&op->list);
1151 unlock_extent(&info->extent_ins, op->bytenr,
1152 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 kfree(op);
1154 } else {
1155 /*
1156 * the extent has multiple refs and the backref we were looking
1157 * for was not right next to it, so just unlock and go next,
1158 * we're good to go
1159 */
1160 list_del_init(&op->list);
1161 unlock_extent(&info->extent_ins, op->bytenr,
1162 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 kfree(op);
1164 }
1165
1166 btrfs_release_path(extent_root, path);
1167 if (!list_empty(del_list))
1168 goto search;
1169
1170out:
1171 btrfs_free_path(path);
1172 return ret;
1173}
1174
1175static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 689static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root, u64 bytenr, 690 struct btrfs_root *root, u64 bytenr,
691 u64 num_bytes,
1177 u64 orig_parent, u64 parent, 692 u64 orig_parent, u64 parent,
1178 u64 orig_root, u64 ref_root, 693 u64 orig_root, u64 ref_root,
1179 u64 orig_generation, u64 ref_generation, 694 u64 orig_generation, u64 ref_generation,
1180 u64 owner_objectid) 695 u64 owner_objectid)
1181{ 696{
1182 int ret; 697 int ret;
1183 struct btrfs_root *extent_root = root->fs_info->extent_root; 698 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1184 struct btrfs_path *path;
1185
1186 if (root == root->fs_info->extent_root) {
1187 struct pending_extent_op *extent_op;
1188 u64 num_bytes;
1189
1190 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 mutex_lock(&root->fs_info->extent_ins_mutex);
1193 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 u64 priv;
1196 ret = get_state_private(&root->fs_info->extent_ins,
1197 bytenr, &priv);
1198 BUG_ON(ret);
1199 extent_op = (struct pending_extent_op *)
1200 (unsigned long)priv;
1201 BUG_ON(extent_op->parent != orig_parent);
1202 BUG_ON(extent_op->generation != orig_generation);
1203 699
1204 extent_op->parent = parent; 700 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1205 extent_op->generation = ref_generation; 701 orig_parent, parent, orig_root,
1206 } else { 702 ref_root, orig_generation,
1207 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 703 ref_generation, owner_objectid, pin);
1208 BUG_ON(!extent_op);
1209
1210 extent_op->type = PENDING_BACKREF_UPDATE;
1211 extent_op->bytenr = bytenr;
1212 extent_op->num_bytes = num_bytes;
1213 extent_op->parent = parent;
1214 extent_op->orig_parent = orig_parent;
1215 extent_op->generation = ref_generation;
1216 extent_op->orig_generation = orig_generation;
1217 extent_op->level = (int)owner_objectid;
1218 INIT_LIST_HEAD(&extent_op->list);
1219 extent_op->del = 0;
1220
1221 set_extent_bits(&root->fs_info->extent_ins,
1222 bytenr, bytenr + num_bytes - 1,
1223 EXTENT_WRITEBACK, GFP_NOFS);
1224 set_state_private(&root->fs_info->extent_ins,
1225 bytenr, (unsigned long)extent_op);
1226 }
1227 mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 return 0;
1229 }
1230
1231 path = btrfs_alloc_path();
1232 if (!path)
1233 return -ENOMEM;
1234 ret = lookup_extent_backref(trans, extent_root, path,
1235 bytenr, orig_parent, orig_root,
1236 orig_generation, owner_objectid, 1);
1237 if (ret)
1238 goto out;
1239 ret = remove_extent_backref(trans, extent_root, path);
1240 if (ret)
1241 goto out;
1242 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 parent, ref_root, ref_generation,
1244 owner_objectid);
1245 BUG_ON(ret); 704 BUG_ON(ret);
1246 finish_current_insert(trans, extent_root, 0);
1247 del_pending_extents(trans, extent_root, 0);
1248out:
1249 btrfs_free_path(path);
1250 return ret; 705 return ret;
1251} 706}
1252 707
1253int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 708int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root, u64 bytenr, 709 struct btrfs_root *root, u64 bytenr,
1255 u64 orig_parent, u64 parent, 710 u64 num_bytes, u64 orig_parent, u64 parent,
1256 u64 ref_root, u64 ref_generation, 711 u64 ref_root, u64 ref_generation,
1257 u64 owner_objectid) 712 u64 owner_objectid)
1258{ 713{
@@ -1260,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1260 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 715 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 716 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 return 0; 717 return 0;
1263 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 718
1264 parent, ref_root, ref_root, 719 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1265 ref_generation, ref_generation, 720 orig_parent, parent, ref_root,
1266 owner_objectid); 721 ref_root, ref_generation,
722 ref_generation, owner_objectid);
1267 return ret; 723 return ret;
1268} 724}
1269
1270static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 725static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 struct btrfs_root *root, u64 bytenr, 726 struct btrfs_root *root, u64 bytenr,
727 u64 num_bytes,
1272 u64 orig_parent, u64 parent, 728 u64 orig_parent, u64 parent,
1273 u64 orig_root, u64 ref_root, 729 u64 orig_root, u64 ref_root,
1274 u64 orig_generation, u64 ref_generation, 730 u64 orig_generation, u64 ref_generation,
1275 u64 owner_objectid) 731 u64 owner_objectid)
1276{ 732{
733 int ret;
734
735 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736 ref_generation, owner_objectid,
737 BTRFS_ADD_DELAYED_REF, 0);
738 BUG_ON(ret);
739 return ret;
740}
741
742static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743 struct btrfs_root *root, u64 bytenr,
744 u64 num_bytes, u64 parent, u64 ref_root,
745 u64 ref_generation, u64 owner_objectid,
746 int refs_to_add)
747{
1277 struct btrfs_path *path; 748 struct btrfs_path *path;
1278 int ret; 749 int ret;
1279 struct btrfs_key key; 750 struct btrfs_key key;
@@ -1286,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1286 return -ENOMEM; 757 return -ENOMEM;
1287 758
1288 path->reada = 1; 759 path->reada = 1;
760 path->leave_spinning = 1;
1289 key.objectid = bytenr; 761 key.objectid = bytenr;
1290 key.type = BTRFS_EXTENT_ITEM_KEY; 762 key.type = BTRFS_EXTENT_ITEM_KEY;
1291 key.offset = (u64)-1; 763 key.offset = num_bytes;
1292 764
1293 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 765 /* first find the extent item and update its reference count */
1294 0, 1); 766 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1295 if (ret < 0) 767 path, 0, 1);
768 if (ret < 0) {
769 btrfs_set_path_blocking(path);
1296 return ret; 770 return ret;
1297 BUG_ON(ret == 0 || path->slots[0] == 0); 771 }
1298 772
1299 path->slots[0]--; 773 if (ret > 0) {
774 WARN_ON(1);
775 btrfs_free_path(path);
776 return -EIO;
777 }
1300 l = path->nodes[0]; 778 l = path->nodes[0];
1301 779
1302 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 780 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1310 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 788 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 789
1312 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 790 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
1313 refs = btrfs_extent_refs(l, item); 792 refs = btrfs_extent_refs(l, item);
1314 btrfs_set_extent_refs(l, item, refs + 1); 793 btrfs_set_extent_refs(l, item, refs + refs_to_add);
794 btrfs_unlock_up_safe(path, 1);
795
1315 btrfs_mark_buffer_dirty(path->nodes[0]); 796 btrfs_mark_buffer_dirty(path->nodes[0]);
1316 797
1317 btrfs_release_path(root->fs_info->extent_root, path); 798 btrfs_release_path(root->fs_info->extent_root, path);
1318 799
1319 path->reada = 1; 800 path->reada = 1;
801 path->leave_spinning = 1;
802
803 /* now insert the actual backref */
1320 ret = insert_extent_backref(trans, root->fs_info->extent_root, 804 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 path, bytenr, parent, 805 path, bytenr, parent,
1322 ref_root, ref_generation, 806 ref_root, ref_generation,
1323 owner_objectid); 807 owner_objectid, refs_to_add);
1324 BUG_ON(ret); 808 BUG_ON(ret);
1325 finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 del_pending_extents(trans, root->fs_info->extent_root, 0);
1327
1328 btrfs_free_path(path); 809 btrfs_free_path(path);
1329 return 0; 810 return 0;
1330} 811}
@@ -1339,68 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1339 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 820 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 821 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 return 0; 822 return 0;
1342 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 823
824 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1343 0, ref_root, 0, ref_generation, 825 0, ref_root, 0, ref_generation,
1344 owner_objectid); 826 owner_objectid);
1345 return ret; 827 return ret;
1346} 828}
1347 829
1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 830static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_root *root) 831 struct btrfs_root *root,
832 struct btrfs_delayed_ref_node *node)
833{
834 int ret = 0;
835 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
836
837 BUG_ON(node->ref_mod == 0);
838 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
839 node->parent, ref->root, ref->generation,
840 ref->owner_objectid, ref->pin, node->ref_mod);
841
842 return ret;
843}
844
845/* helper function to actually process a single delayed ref entry */
846static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_delayed_ref_node *node,
849 int insert_reserved)
1350{ 850{
1351 u64 start;
1352 u64 end;
1353 int ret; 851 int ret;
852 struct btrfs_delayed_ref *ref;
1354 853
1355 while(1) { 854 if (node->parent == (u64)-1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1); 855 struct btrfs_delayed_ref_head *head;
1357 del_pending_extents(trans, root->fs_info->extent_root, 1); 856 /*
857 * we've hit the end of the chain and we were supposed
858 * to insert this extent into the tree. But, it got
859 * deleted before we ever needed to insert it, so all
860 * we have to do is clean up the accounting
861 */
862 if (insert_reserved) {
863 update_reserved_extents(root, node->bytenr,
864 node->num_bytes, 0);
865 }
866 head = btrfs_delayed_node_to_head(node);
867 mutex_unlock(&head->mutex);
868 return 0;
869 }
1358 870
1359 /* is there more work to do? */ 871 ref = btrfs_delayed_node_to_ref(node);
1360 ret = find_first_extent_bit(&root->fs_info->pending_del, 872 if (ref->action == BTRFS_ADD_DELAYED_REF) {
1361 0, &start, &end, EXTENT_WRITEBACK); 873 if (insert_reserved) {
1362 if (!ret) 874 struct btrfs_key ins;
1363 continue; 875
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins, 876 ins.objectid = node->bytenr;
1365 0, &start, &end, EXTENT_WRITEBACK); 877 ins.offset = node->num_bytes;
1366 if (!ret) 878 ins.type = BTRFS_EXTENT_ITEM_KEY;
1367 continue; 879
1368 break; 880 /* record the full extent allocation */
881 ret = __btrfs_alloc_reserved_extent(trans, root,
882 node->parent, ref->root,
883 ref->generation, ref->owner_objectid,
884 &ins, node->ref_mod);
885 update_reserved_extents(root, node->bytenr,
886 node->num_bytes, 0);
887 } else {
888 /* just add one backref */
889 ret = add_extent_ref(trans, root, node->bytenr,
890 node->num_bytes,
891 node->parent, ref->root, ref->generation,
892 ref->owner_objectid, node->ref_mod);
893 }
894 BUG_ON(ret);
895 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896 WARN_ON(insert_reserved);
897 ret = drop_delayed_ref(trans, root, node);
1369 } 898 }
1370 return 0; 899 return 0;
1371} 900}
1372 901
1373int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 902static noinline struct btrfs_delayed_ref_node *
1374 struct btrfs_root *root, u64 bytenr, 903select_delayed_ref(struct btrfs_delayed_ref_head *head)
1375 u64 num_bytes, u32 *refs)
1376{ 904{
1377 struct btrfs_path *path; 905 struct rb_node *node;
906 struct btrfs_delayed_ref_node *ref;
907 int action = BTRFS_ADD_DELAYED_REF;
908again:
909 /*
910 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
911 * this prevents ref count from going down to zero when
912 * there still are pending delayed ref.
913 */
914 node = rb_prev(&head->node.rb_node);
915 while (1) {
916 if (!node)
917 break;
918 ref = rb_entry(node, struct btrfs_delayed_ref_node,
919 rb_node);
920 if (ref->bytenr != head->node.bytenr)
921 break;
922 if (btrfs_delayed_node_to_ref(ref)->action == action)
923 return ref;
924 node = rb_prev(node);
925 }
926 if (action == BTRFS_ADD_DELAYED_REF) {
927 action = BTRFS_DROP_DELAYED_REF;
928 goto again;
929 }
930 return NULL;
931}
932
933static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
934 struct btrfs_root *root,
935 struct list_head *cluster)
936{
937 struct btrfs_delayed_ref_root *delayed_refs;
938 struct btrfs_delayed_ref_node *ref;
939 struct btrfs_delayed_ref_head *locked_ref = NULL;
1378 int ret; 940 int ret;
1379 struct btrfs_key key; 941 int count = 0;
1380 struct extent_buffer *l; 942 int must_insert_reserved = 0;
1381 struct btrfs_extent_item *item;
1382 943
1383 WARN_ON(num_bytes < root->sectorsize); 944 delayed_refs = &trans->transaction->delayed_refs;
1384 path = btrfs_alloc_path(); 945 while (1) {
1385 path->reada = 1; 946 if (!locked_ref) {
1386 key.objectid = bytenr; 947 /* pick a new head ref from the cluster list */
1387 key.offset = num_bytes; 948 if (list_empty(cluster))
1388 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 949 break;
1389 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 950
1390 0, 0); 951 locked_ref = list_entry(cluster->next,
1391 if (ret < 0) 952 struct btrfs_delayed_ref_head, cluster);
1392 goto out; 953
1393 if (ret != 0) { 954 /* grab the lock that says we are going to process
1394 btrfs_print_leaf(root, path->nodes[0]); 955 * all the refs for this head */
1395 printk(KERN_INFO "btrfs failed to find block number %llu\n", 956 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1396 (unsigned long long)bytenr); 957
1397 BUG(); 958 /*
959 * we may have dropped the spin lock to get the head
960 * mutex lock, and that might have given someone else
961 * time to free the head. If that's true, it has been
962 * removed from our list and we can move on.
963 */
964 if (ret == -EAGAIN) {
965 locked_ref = NULL;
966 count++;
967 continue;
968 }
969 }
970
971 /*
972 * record the must insert reserved flag before we
973 * drop the spin lock.
974 */
975 must_insert_reserved = locked_ref->must_insert_reserved;
976 locked_ref->must_insert_reserved = 0;
977
978 /*
979 * locked_ref is the head node, so we have to go one
980 * node back for any delayed ref updates
981 */
982 ref = select_delayed_ref(locked_ref);
983 if (!ref) {
984 /* All delayed refs have been processed, Go ahead
985 * and send the head node to run_one_delayed_ref,
986 * so that any accounting fixes can happen
987 */
988 ref = &locked_ref->node;
989 list_del_init(&locked_ref->cluster);
990 locked_ref = NULL;
991 }
992
993 ref->in_tree = 0;
994 rb_erase(&ref->rb_node, &delayed_refs->root);
995 delayed_refs->num_entries--;
996 spin_unlock(&delayed_refs->lock);
997
998 ret = run_one_delayed_ref(trans, root, ref,
999 must_insert_reserved);
1000 BUG_ON(ret);
1001 btrfs_put_delayed_ref(ref);
1002
1003 count++;
1004 cond_resched();
1005 spin_lock(&delayed_refs->lock);
1006 }
1007 return count;
1008}
1009
1010/*
1011 * this starts processing the delayed reference count updates and
1012 * extent insertions we have queued up so far. count can be
1013 * 0, which means to process everything in the tree at the start
1014 * of the run (but not newly added entries), or it can be some target
1015 * number you'd like to process.
1016 */
1017int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, unsigned long count)
1019{
1020 struct rb_node *node;
1021 struct btrfs_delayed_ref_root *delayed_refs;
1022 struct btrfs_delayed_ref_node *ref;
1023 struct list_head cluster;
1024 int ret;
1025 int run_all = count == (unsigned long)-1;
1026 int run_most = 0;
1027
1028 if (root == root->fs_info->extent_root)
1029 root = root->fs_info->tree_root;
1030
1031 delayed_refs = &trans->transaction->delayed_refs;
1032 INIT_LIST_HEAD(&cluster);
1033again:
1034 spin_lock(&delayed_refs->lock);
1035 if (count == 0) {
1036 count = delayed_refs->num_entries * 2;
1037 run_most = 1;
1038 }
1039 while (1) {
1040 if (!(run_all || run_most) &&
1041 delayed_refs->num_heads_ready < 64)
1042 break;
1043
1044 /*
1045 * go find something we can process in the rbtree. We start at
1046 * the beginning of the tree, and then build a cluster
1047 * of refs to process starting at the first one we are able to
1048 * lock
1049 */
1050 ret = btrfs_find_ref_cluster(trans, &cluster,
1051 delayed_refs->run_delayed_start);
1052 if (ret)
1053 break;
1054
1055 ret = run_clustered_refs(trans, root, &cluster);
1056 BUG_ON(ret < 0);
1057
1058 count -= min_t(unsigned long, ret, count);
1059
1060 if (count == 0)
1061 break;
1062 }
1063
1064 if (run_all) {
1065 node = rb_first(&delayed_refs->root);
1066 if (!node)
1067 goto out;
1068 count = (unsigned long)-1;
1069
1070 while (node) {
1071 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1072 rb_node);
1073 if (btrfs_delayed_ref_is_head(ref)) {
1074 struct btrfs_delayed_ref_head *head;
1075
1076 head = btrfs_delayed_node_to_head(ref);
1077 atomic_inc(&ref->refs);
1078
1079 spin_unlock(&delayed_refs->lock);
1080 mutex_lock(&head->mutex);
1081 mutex_unlock(&head->mutex);
1082
1083 btrfs_put_delayed_ref(ref);
1084 cond_resched();
1085 goto again;
1086 }
1087 node = rb_next(node);
1088 }
1089 spin_unlock(&delayed_refs->lock);
1090 schedule_timeout(1);
1091 goto again;
1092	}
1093out:
1094	spin_unlock(&delayed_refs->lock);
1095	return 0;
1096}
1097
1398	}
1399	l = path->nodes[0];
1400	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401	*refs = btrfs_extent_refs(l, item);
1402out:
1403	btrfs_free_path(path);
1404	return 0;
1405}
1406
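btrfs_run_delayed_refs() above interprets its count argument three ways. The helper below is invented purely to spell that out; RUN_ALL and effective_target() are not kernel names.

    #include <stdio.h>

    #define RUN_ALL ((unsigned long)-1)

    /* num_entries stands in for delayed_refs->num_entries */
    static unsigned long effective_target(unsigned long count,
                                          unsigned long num_entries)
    {
            if (count == RUN_ALL)
                    return RUN_ALL;          /* drain, then recheck for new heads */
            if (count == 0)
                    return num_entries * 2;  /* bounded snapshot of current work */
            return count;                    /* caller-supplied target */
    }

    int main(void)
    {
            printf("%lu\n", effective_target(0, 128));   /* 256 */
            printf("%lu\n", effective_target(64, 128));  /* 64 */
            printf("run_all: %d\n",
                   effective_target(RUN_ALL, 128) == RUN_ALL);
            return 0;
    }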
@@ -1624,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1624 int refi = 0; 1315 int refi = 0;
1625 int slot; 1316 int slot;
1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1317 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 u64, u64, u64, u64, u64, u64, u64, u64); 1318 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1628 1319
1629 ref_root = btrfs_header_owner(buf); 1320 ref_root = btrfs_header_owner(buf);
1630 ref_generation = btrfs_header_generation(buf); 1321 ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1696 1387
1697 if (level == 0) { 1388 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot); 1389 btrfs_item_key_to_cpu(buf, &key, slot);
1390 fi = btrfs_item_ptr(buf, slot,
1391 struct btrfs_file_extent_item);
1392
1393 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394 if (bytenr == 0)
1395 continue;
1699 1396
1700 ret = process_func(trans, root, bytenr, 1397 ret = process_func(trans, root, bytenr,
1701 orig_buf->start, buf->start, 1398 btrfs_file_extent_disk_num_bytes(buf, fi),
1702 orig_root, ref_root, 1399 orig_buf->start, buf->start,
1703 orig_generation, ref_generation, 1400 orig_root, ref_root,
1704 key.objectid); 1401 orig_generation, ref_generation,
1402 key.objectid);
1705 1403
1706 if (ret) { 1404 if (ret) {
1707 faili = slot; 1405 faili = slot;
@@ -1709,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1709 goto fail; 1407 goto fail;
1710 } 1408 }
1711 } else { 1409 } else {
1712 ret = process_func(trans, root, bytenr, 1410 ret = process_func(trans, root, bytenr, buf->len,
1713 orig_buf->start, buf->start, 1411 orig_buf->start, buf->start,
1714 orig_root, ref_root, 1412 orig_root, ref_root,
1715 orig_generation, ref_generation, 1413 orig_generation, ref_generation,
@@ -1786,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1786 if (bytenr == 0) 1484 if (bytenr == 0)
1787 continue; 1485 continue;
1788 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1486 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 orig_buf->start, buf->start, 1487 btrfs_file_extent_disk_num_bytes(buf, fi),
1790 orig_root, ref_root, 1488 orig_buf->start, buf->start,
1791 orig_generation, ref_generation, 1489 orig_root, ref_root, orig_generation,
1792 key.objectid); 1490 ref_generation, key.objectid);
1793 if (ret) 1491 if (ret)
1794 goto fail; 1492 goto fail;
1795 } else { 1493 } else {
1796 bytenr = btrfs_node_blockptr(buf, slot); 1494 bytenr = btrfs_node_blockptr(buf, slot);
1797 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1495 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 orig_buf->start, buf->start, 1496 buf->len, orig_buf->start,
1799 orig_root, ref_root, 1497 buf->start, orig_root, ref_root,
1800 orig_generation, ref_generation, 1498 orig_generation, ref_generation,
1801 level - 1); 1499 level - 1);
1802 if (ret) 1500 if (ret)
@@ -1815,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1815 struct btrfs_block_group_cache *cache) 1513 struct btrfs_block_group_cache *cache)
1816{ 1514{
1817 int ret; 1515 int ret;
1818 int pending_ret;
1819 struct btrfs_root *extent_root = root->fs_info->extent_root; 1516 struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 unsigned long bi; 1517 unsigned long bi;
1821 struct extent_buffer *leaf; 1518 struct extent_buffer *leaf;
@@ -1831,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1831 btrfs_mark_buffer_dirty(leaf); 1528 btrfs_mark_buffer_dirty(leaf);
1832 btrfs_release_path(extent_root, path); 1529 btrfs_release_path(extent_root, path);
1833fail: 1530fail:
1834 finish_current_insert(trans, extent_root, 0);
1835 pending_ret = del_pending_extents(trans, extent_root, 0);
1836 if (ret) 1531 if (ret)
1837 return ret; 1532 return ret;
1838 if (pending_ret)
1839 return pending_ret;
1840 return 0; 1533 return 0;
1841 1534
1842} 1535}
@@ -1900,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1900 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1901 readonly = 1; 1594 readonly = 1;
1902 if (block_group) 1595 if (block_group)
1903 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1904 return readonly; 1597 return readonly;
1905} 1598}
1906 1599
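The put_block_group() to btrfs_put_block_group() rename in this hunk is mechanical, but the discipline it serves is worth making explicit: every lookup bumps the group's atomic count, and every exit path must drop it. A minimal single-threaded sketch of that pairing, with a plain int standing in for the kernel's atomic_t:

    #include <stdio.h>

    struct block_group {
            int count;
    };

    static void get_group(struct block_group *bg)
    {
            bg->count++;               /* lookup takes a reference */
    }

    static void put_group(struct block_group *bg)
    {
            if (--bg->count == 0)      /* last put frees the group */
                    printf("group freed\n");
    }

    int main(void)
    {
            struct block_group bg = { .count = 1 };   /* creation ref */

            get_group(&bg);    /* e.g. a lookup by bytenr */
            put_group(&bg);    /* every exit path pairs the get */
            put_group(&bg);    /* dropping the creation ref frees */
            return 0;
    }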
@@ -2151,10 +1844,14 @@ again:
2151 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 1844 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 ", %llu bytes_used, %llu bytes_reserved, " 1845 ", %llu bytes_used, %llu bytes_reserved, "
2153 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 1846 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2154 "%llu total\n", bytes, data_sinfo->bytes_delalloc, 1847 "%llu total\n", (unsigned long long)bytes,
2155 data_sinfo->bytes_used, data_sinfo->bytes_reserved, 1848 (unsigned long long)data_sinfo->bytes_delalloc,
2156 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, 1849 (unsigned long long)data_sinfo->bytes_used,
2157 data_sinfo->bytes_may_use, data_sinfo->total_bytes); 1850 (unsigned long long)data_sinfo->bytes_reserved,
1851 (unsigned long long)data_sinfo->bytes_pinned,
1852 (unsigned long long)data_sinfo->bytes_readonly,
1853 (unsigned long long)data_sinfo->bytes_may_use,
1854 (unsigned long long)data_sinfo->total_bytes);
2158 return -ENOSPC; 1855 return -ENOSPC;
2159 } 1856 }
2160 data_sinfo->bytes_may_use += bytes; 1857 data_sinfo->bytes_may_use += bytes;
@@ -2225,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2225 spin_unlock(&info->lock); 1922 spin_unlock(&info->lock);
2226} 1923}
2227 1924
1925static void force_metadata_allocation(struct btrfs_fs_info *info)
1926{
1927 struct list_head *head = &info->space_info;
1928 struct btrfs_space_info *found;
1929
1930 rcu_read_lock();
1931 list_for_each_entry_rcu(found, head, list) {
1932 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
1933 found->force_alloc = 1;
1934 }
1935 rcu_read_unlock();
1936}
1937
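force_metadata_allocation() can get away with rcu_read_lock() because the walk only flips a flag and never sleeps. A user-space rendering of the same walk, minus the RCU machinery; struct space_info and the flag value below are simplified stand-ins rather than the kernel definitions.

    #include <stdbool.h>
    #include <stdio.h>

    #define BLOCK_GROUP_METADATA 0x4   /* illustrative flag value */

    struct space_info {
            struct space_info *next;
            unsigned flags;
            bool force_alloc;
    };

    /* same shape as force_metadata_allocation(): a walk that only
     * sets a flag, which is why a read-side RCU lock suffices in
     * the kernel version instead of a sleeping lock */
    static void force_metadata(struct space_info *head)
    {
            for (struct space_info *s = head; s; s = s->next)
                    if (s->flags & BLOCK_GROUP_METADATA)
                            s->force_alloc = true;
    }

    int main(void)
    {
            struct space_info data = { NULL, 0x1, false };
            struct space_info meta = { &data, BLOCK_GROUP_METADATA, false };

            force_metadata(&meta);
            printf("meta=%d data=%d\n", meta.force_alloc, data.force_alloc);
            return 0;
    }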
2228static int do_chunk_alloc(struct btrfs_trans_handle *trans, 1938static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2229 struct btrfs_root *extent_root, u64 alloc_bytes, 1939 struct btrfs_root *extent_root, u64 alloc_bytes,
2230 u64 flags, int force) 1940 u64 flags, int force)
2231{ 1941{
2232 struct btrfs_space_info *space_info; 1942 struct btrfs_space_info *space_info;
1943 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2233 u64 thresh; 1944 u64 thresh;
2234 int ret = 0; 1945 int ret = 0;
2235 1946
2236 mutex_lock(&extent_root->fs_info->chunk_mutex); 1947 mutex_lock(&fs_info->chunk_mutex);
2237 1948
2238 flags = btrfs_reduce_alloc_profile(extent_root, flags); 1949 flags = btrfs_reduce_alloc_profile(extent_root, flags);
2239 1950
@@ -2265,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2265 } 1976 }
2266 spin_unlock(&space_info->lock); 1977 spin_unlock(&space_info->lock);
2267 1978
1979 /*
1980 * if we're doing a data chunk, go ahead and make sure that
1981 * we keep a reasonable number of metadata chunks allocated in the
1982 * FS as well.
1983 */
1984 if (flags & BTRFS_BLOCK_GROUP_DATA) {
1985 fs_info->data_chunk_allocations++;
1986 if (!(fs_info->data_chunk_allocations %
1987 fs_info->metadata_ratio))
1988 force_metadata_allocation(fs_info);
1989 }
1990
2268 ret = btrfs_alloc_chunk(trans, extent_root, flags); 1991 ret = btrfs_alloc_chunk(trans, extent_root, flags);
2269 if (ret) 1992 if (ret)
2270 space_info->full = 1; 1993 space_info->full = 1;
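The new data_chunk_allocations counter forces a metadata chunk on every metadata_ratio-th data chunk allocation. The arithmetic, with a ratio of 8 assumed here purely for illustration:

    #include <stdio.h>

    int main(void)
    {
            unsigned ratio = 8;   /* stand-in for fs_info->metadata_ratio */
            unsigned data_chunk_allocations = 0;

            for (int i = 0; i < 20; i++) {
                    data_chunk_allocations++;
                    /* same test as do_chunk_alloc(): every ratio-th
                     * data chunk also forces metadata allocation */
                    if (!(data_chunk_allocations % ratio))
                            printf("allocation %u forces metadata\n",
                                   data_chunk_allocations);
            }
            return 0;
    }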
@@ -2324,7 +2047,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2324 WARN_ON(ret); 2047 WARN_ON(ret);
2325 } 2048 }
2326 } 2049 }
2327 put_block_group(cache); 2050 btrfs_put_block_group(cache);
2328 total -= num_bytes; 2051 total -= num_bytes;
2329 bytenr += num_bytes; 2052 bytenr += num_bytes;
2330 } 2053 }
@@ -2341,7 +2064,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2341 return 0; 2064 return 0;
2342 2065
2343 bytenr = cache->key.objectid; 2066 bytenr = cache->key.objectid;
2344 put_block_group(cache); 2067 btrfs_put_block_group(cache);
2345 2068
2346 return bytenr; 2069 return bytenr;
2347} 2070}
@@ -2353,7 +2076,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2353 struct btrfs_block_group_cache *cache; 2076 struct btrfs_block_group_cache *cache;
2354 struct btrfs_fs_info *fs_info = root->fs_info; 2077 struct btrfs_fs_info *fs_info = root->fs_info;
2355 2078
2356 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2357 if (pin) { 2079 if (pin) {
2358 set_extent_dirty(&fs_info->pinned_extents, 2080 set_extent_dirty(&fs_info->pinned_extents,
2359 bytenr, bytenr + num - 1, GFP_NOFS); 2081 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2361,6 +2083,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2361 clear_extent_dirty(&fs_info->pinned_extents, 2083 clear_extent_dirty(&fs_info->pinned_extents,
2362 bytenr, bytenr + num - 1, GFP_NOFS); 2084 bytenr, bytenr + num - 1, GFP_NOFS);
2363 } 2085 }
2086
2364 while (num > 0) { 2087 while (num > 0) {
2365 cache = btrfs_lookup_block_group(fs_info, bytenr); 2088 cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 BUG_ON(!cache); 2089 BUG_ON(!cache);
@@ -2385,7 +2108,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2385 if (cache->cached) 2108 if (cache->cached)
2386 btrfs_add_free_space(cache, bytenr, len); 2109 btrfs_add_free_space(cache, bytenr, len);
2387 } 2110 }
2388 put_block_group(cache); 2111 btrfs_put_block_group(cache);
2389 bytenr += len; 2112 bytenr += len;
2390 num -= len; 2113 num -= len;
2391 } 2114 }
@@ -2416,7 +2139,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2416 } 2139 }
2417 spin_unlock(&cache->lock); 2140 spin_unlock(&cache->lock);
2418 spin_unlock(&cache->space_info->lock); 2141 spin_unlock(&cache->space_info->lock);
2419 put_block_group(cache); 2142 btrfs_put_block_group(cache);
2420 bytenr += len; 2143 bytenr += len;
2421 num -= len; 2144 num -= len;
2422 } 2145 }
@@ -2431,7 +2154,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2431 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2154 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2432 int ret; 2155 int ret;
2433 2156
2434 mutex_lock(&root->fs_info->pinned_mutex);
2435 while (1) { 2157 while (1) {
2436 ret = find_first_extent_bit(pinned_extents, last, 2158 ret = find_first_extent_bit(pinned_extents, last,
2437 &start, &end, EXTENT_DIRTY); 2159 &start, &end, EXTENT_DIRTY);
@@ -2440,7 +2162,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2440 set_extent_dirty(copy, start, end, GFP_NOFS); 2162 set_extent_dirty(copy, start, end, GFP_NOFS);
2441 last = end + 1; 2163 last = end + 1;
2442 } 2164 }
2443 mutex_unlock(&root->fs_info->pinned_mutex);
2444 return 0; 2165 return 0;
2445} 2166}
2446 2167
@@ -2452,7 +2173,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2452 u64 end; 2173 u64 end;
2453 int ret; 2174 int ret;
2454 2175
2455 mutex_lock(&root->fs_info->pinned_mutex);
2456 while (1) { 2176 while (1) {
2457 ret = find_first_extent_bit(unpin, 0, &start, &end, 2177 ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 EXTENT_DIRTY); 2178 EXTENT_DIRTY);
@@ -2461,209 +2181,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2461 2181
2462 ret = btrfs_discard_extent(root, start, end + 1 - start); 2182 ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 2183
2184 /* unlocks the pinned mutex */
2464 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2185 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2186 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 2187
2188	cond_resched();
2189	}
2190	return ret;
2191}
2192
2467	if (need_resched()) {
2468	mutex_unlock(&root->fs_info->pinned_mutex);
2469	cond_resched();
2470	mutex_lock(&root->fs_info->pinned_mutex);
2471	}
2472	}
2473	mutex_unlock(&root->fs_info->pinned_mutex);
2474	return ret;
2475}
2476
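Both btrfs_copy_pinned() and btrfs_finish_extent_commit() share one loop shape: ask for the first dirty range at or beyond last, handle it, then continue from end + 1. A toy bitmap version of that iteration; first_dirty_run() is an invented stand-in for find_first_extent_bit().

    #include <stdio.h>

    #define NBITS 32

    /* find the first set run at or after 'from'; returns 0 and fills
     * [*start, *end] on success, nonzero when nothing is left */
    static int first_dirty_run(unsigned bits, unsigned from,
                               unsigned *start, unsigned *end)
    {
            unsigned i = from;

            while (i < NBITS && !(bits & (1u << i)))
                    i++;
            if (i == NBITS)
                    return 1;
            *start = i;
            while (i + 1 < NBITS && (bits & (1u << (i + 1))))
                    i++;
            *end = i;
            return 0;
    }

    int main(void)
    {
            unsigned dirty = 0x0f30;   /* two runs: bits 4-5 and 8-11 */
            unsigned last = 0, start, end;

            /* same loop shape as btrfs_finish_extent_commit() */
            while (!first_dirty_run(dirty, last, &start, &end)) {
                    printf("run [%u, %u]\n", start, end);
                    last = end + 1;
            }
            return 0;
    }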
2477static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 struct btrfs_root *extent_root, int all)
2479{
2480 u64 start;
2481 u64 end;
2482 u64 priv;
2483 u64 search = 0;
2484 struct btrfs_fs_info *info = extent_root->fs_info;
2485 struct btrfs_path *path;
2486 struct pending_extent_op *extent_op, *tmp;
2487 struct list_head insert_list, update_list;
2488 int ret;
2489 int num_inserts = 0, max_inserts, restart = 0;
2490
2491 path = btrfs_alloc_path();
2492 INIT_LIST_HEAD(&insert_list);
2493 INIT_LIST_HEAD(&update_list);
2494
2495 max_inserts = extent_root->leafsize /
2496 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 sizeof(struct btrfs_extent_ref) +
2498 sizeof(struct btrfs_extent_item));
2499again:
2500 mutex_lock(&info->extent_ins_mutex);
2501 while (1) {
2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 &end, EXTENT_WRITEBACK);
2504 if (ret) {
2505 if (restart && !num_inserts &&
2506 list_empty(&update_list)) {
2507 restart = 0;
2508 search = 0;
2509 continue;
2510 }
2511 break;
2512 }
2513
2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 if (!ret) {
2516 if (all)
2517 restart = 1;
2518 search = end + 1;
2519 if (need_resched()) {
2520 mutex_unlock(&info->extent_ins_mutex);
2521 cond_resched();
2522 mutex_lock(&info->extent_ins_mutex);
2523 }
2524 continue;
2525 }
2526
2527 ret = get_state_private(&info->extent_ins, start, &priv);
2528 BUG_ON(ret);
2529 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530
2531 if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 num_inserts++;
2533 list_add_tail(&extent_op->list, &insert_list);
2534 search = end + 1;
2535 if (num_inserts == max_inserts) {
2536 restart = 1;
2537 break;
2538 }
2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 list_add_tail(&extent_op->list, &update_list);
2541 search = end + 1;
2542 } else {
2543 BUG();
2544 }
2545 }
2546
2547 /*
2548 * process the update list, clear the writeback bit for it, and if
2549 * somebody marked this thing for deletion then just unlock it and be
2550 * done, the free_extents will handle it
2551 */
2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 extent_op->bytenr + extent_op->num_bytes - 1,
2555 EXTENT_WRITEBACK, GFP_NOFS);
2556 if (extent_op->del) {
2557 list_del_init(&extent_op->list);
2558 unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 extent_op->bytenr + extent_op->num_bytes
2560 - 1, GFP_NOFS);
2561 kfree(extent_op);
2562 }
2563 }
2564 mutex_unlock(&info->extent_ins_mutex);
2565
2566 /*
2567	 * still have things left on the update list, go ahead and update
2568 * everything
2569 */
2570 if (!list_empty(&update_list)) {
2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2577 }
2578
2579 /*
2580 * if no inserts need to be done, but we skipped some extents and we
2581 * need to make sure everything is cleaned then reset everything and
2582 * go back to the beginning
2583 */
2584 if (!num_inserts && restart) {
2585 search = 0;
2586 restart = 0;
2587 INIT_LIST_HEAD(&update_list);
2588 INIT_LIST_HEAD(&insert_list);
2589 goto again;
2590 } else if (!num_inserts) {
2591 goto out;
2592 }
2593
2594 /*
2595 * process the insert extents list. Again if we are deleting this
2596 * extent, then just unlock it, pin down the bytes if need be, and be
2597 * done with it. Saves us from having to actually insert the extent
2598 * into the tree and then subsequently come along and delete it
2599 */
2600 mutex_lock(&info->extent_ins_mutex);
2601 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 extent_op->bytenr + extent_op->num_bytes - 1,
2604 EXTENT_WRITEBACK, GFP_NOFS);
2605 if (extent_op->del) {
2606 u64 used;
2607 list_del_init(&extent_op->list);
2608 unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 extent_op->bytenr + extent_op->num_bytes
2610 - 1, GFP_NOFS);
2611
2612 mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 ret = pin_down_bytes(trans, extent_root,
2614 extent_op->bytenr,
2615 extent_op->num_bytes, 0);
2616 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617
2618 spin_lock(&info->delalloc_lock);
2619 used = btrfs_super_bytes_used(&info->super_copy);
2620 btrfs_set_super_bytes_used(&info->super_copy,
2621 used - extent_op->num_bytes);
2622 used = btrfs_root_used(&extent_root->root_item);
2623 btrfs_set_root_used(&extent_root->root_item,
2624 used - extent_op->num_bytes);
2625 spin_unlock(&info->delalloc_lock);
2626
2627 ret = update_block_group(trans, extent_root,
2628 extent_op->bytenr,
2629 extent_op->num_bytes,
2630 0, ret > 0);
2631 BUG_ON(ret);
2632 kfree(extent_op);
2633 num_inserts--;
2634 }
2635 }
2636 mutex_unlock(&info->extent_ins_mutex);
2637
2638 ret = insert_extents(trans, extent_root, path, &insert_list,
2639 num_inserts);
2640 BUG_ON(ret);
2641
2642 /*
2643 * if restart is set for whatever reason we need to go back and start
2644 * searching through the pending list again.
2645 *
2646 * We just inserted some extents, which could have resulted in new
2647 * blocks being allocated, which would result in new blocks needing
2648 * updates, so if all is set we _must_ restart to get the updated
2649 * blocks.
2650 */
2651 if (restart || all) {
2652 INIT_LIST_HEAD(&insert_list);
2653 INIT_LIST_HEAD(&update_list);
2654 search = 0;
2655 restart = 0;
2656 num_inserts = 0;
2657 goto again;
2658 }
2659out:
2660 btrfs_free_path(path);
2661 return 0;
2662}
2663
2664static int pin_down_bytes(struct btrfs_trans_handle *trans, 2193static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 struct btrfs_root *root, 2194 struct btrfs_root *root,
2666 u64 bytenr, u64 num_bytes, int is_data) 2195 struct btrfs_path *path,
2196 u64 bytenr, u64 num_bytes, int is_data,
2197 struct extent_buffer **must_clean)
2667{ 2198{
2668 int err = 0; 2199 int err = 0;
2669 struct extent_buffer *buf; 2200 struct extent_buffer *buf;
@@ -2686,17 +2217,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2686 u64 header_transid = btrfs_header_generation(buf); 2217 u64 header_transid = btrfs_header_generation(buf);
2687 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2218 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2219 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2220 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2689 header_transid == trans->transid && 2221 header_transid == trans->transid &&
2690 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2222 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 clean_tree_block(NULL, root, buf); 2223 *must_clean = buf;
2692 btrfs_tree_unlock(buf);
2693 free_extent_buffer(buf);
2694 return 1; 2224 return 1;
2695 } 2225 }
2696 btrfs_tree_unlock(buf); 2226 btrfs_tree_unlock(buf);
2697 } 2227 }
2698 free_extent_buffer(buf); 2228 free_extent_buffer(buf);
2699pinit: 2229pinit:
2230 btrfs_set_path_blocking(path);
2231 /* unlocks the pinned mutex */
2700 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2232 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 2233
2702 BUG_ON(err < 0); 2234 BUG_ON(err < 0);
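pin_down_bytes() no longer frees the reusable buffer itself; it hands it back through the new must_clean out-parameter so the caller can clean it once its locks are safe to drop. The shape of that contract, reduced to a user-space sketch with invented types:

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer {
            int id;
    };

    /* like pin_down_bytes(): instead of cleaning the buffer while the
     * caller may still hold tree locks, report it via an out-param */
    static int pin_bytes(int reusable, struct buffer *buf,
                         struct buffer **must_clean)
    {
            if (reusable) {
                    *must_clean = buf;
                    return 1;          /* caller should mark space free */
            }
            return 0;
    }

    int main(void)
    {
            struct buffer *buf = malloc(sizeof(*buf));
            struct buffer *must_clean = NULL;

            buf->id = 42;
            if (pin_bytes(1, buf, &must_clean) > 0 && must_clean) {
                    /* deferred cleanup where blocking is safe */
                    printf("cleaning buffer %d\n", must_clean->id);
                    free(must_clean);
            }
            return 0;
    }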
@@ -2710,7 +2242,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2710 struct btrfs_root *root, 2242 struct btrfs_root *root,
2711 u64 bytenr, u64 num_bytes, u64 parent, 2243 u64 bytenr, u64 num_bytes, u64 parent,
2712 u64 root_objectid, u64 ref_generation, 2244 u64 root_objectid, u64 ref_generation,
2713 u64 owner_objectid, int pin, int mark_free) 2245 u64 owner_objectid, int pin, int mark_free,
2246 int refs_to_drop)
2714{ 2247{
2715 struct btrfs_path *path; 2248 struct btrfs_path *path;
2716 struct btrfs_key key; 2249 struct btrfs_key key;
@@ -2732,6 +2265,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2732 return -ENOMEM; 2265 return -ENOMEM;
2733 2266
2734 path->reada = 1; 2267 path->reada = 1;
2268 path->leave_spinning = 1;
2735 ret = lookup_extent_backref(trans, extent_root, path, 2269 ret = lookup_extent_backref(trans, extent_root, path,
2736 bytenr, parent, root_objectid, 2270 bytenr, parent, root_objectid,
2737 ref_generation, owner_objectid, 1); 2271 ref_generation, owner_objectid, 1);
@@ -2753,9 +2287,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2753 break; 2287 break;
2754 } 2288 }
2755 if (!found_extent) { 2289 if (!found_extent) {
2756 ret = remove_extent_backref(trans, extent_root, path); 2290 ret = remove_extent_backref(trans, extent_root, path,
2291 refs_to_drop);
2757 BUG_ON(ret); 2292 BUG_ON(ret);
2758 btrfs_release_path(extent_root, path); 2293 btrfs_release_path(extent_root, path);
2294 path->leave_spinning = 1;
2759 ret = btrfs_search_slot(trans, extent_root, 2295 ret = btrfs_search_slot(trans, extent_root,
2760 &key, path, -1, 1); 2296 &key, path, -1, 1);
2761 if (ret) { 2297 if (ret) {
@@ -2771,8 +2307,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2771 btrfs_print_leaf(extent_root, path->nodes[0]); 2307 btrfs_print_leaf(extent_root, path->nodes[0]);
2772 WARN_ON(1); 2308 WARN_ON(1);
2773 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2309 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2774 "root %llu gen %llu owner %llu\n", 2310 "parent %llu root %llu gen %llu owner %llu\n",
2775 (unsigned long long)bytenr, 2311 (unsigned long long)bytenr,
2312 (unsigned long long)parent,
2776 (unsigned long long)root_objectid, 2313 (unsigned long long)root_objectid,
2777 (unsigned long long)ref_generation, 2314 (unsigned long long)ref_generation,
2778 (unsigned long long)owner_objectid); 2315 (unsigned long long)owner_objectid);
@@ -2782,17 +2319,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2782 ei = btrfs_item_ptr(leaf, extent_slot, 2319 ei = btrfs_item_ptr(leaf, extent_slot,
2783 struct btrfs_extent_item); 2320 struct btrfs_extent_item);
2784 refs = btrfs_extent_refs(leaf, ei); 2321 refs = btrfs_extent_refs(leaf, ei);
2785 BUG_ON(refs == 0);
2786 refs -= 1;
2787 btrfs_set_extent_refs(leaf, ei, refs);
2788 2322
2323 /*
2324 * we're not allowed to delete the extent item if there
2325 * are other delayed ref updates pending
2326 */
2327
2328 BUG_ON(refs < refs_to_drop);
2329 refs -= refs_to_drop;
2330 btrfs_set_extent_refs(leaf, ei, refs);
2789 btrfs_mark_buffer_dirty(leaf); 2331 btrfs_mark_buffer_dirty(leaf);
2790 2332
2791 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2333 if (refs == 0 && found_extent &&
2334 path->slots[0] == extent_slot + 1) {
2792 struct btrfs_extent_ref *ref; 2335 struct btrfs_extent_ref *ref;
2793 ref = btrfs_item_ptr(leaf, path->slots[0], 2336 ref = btrfs_item_ptr(leaf, path->slots[0],
2794 struct btrfs_extent_ref); 2337 struct btrfs_extent_ref);
2795 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2338 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2796 /* if the back ref and the extent are next to each other 2339 /* if the back ref and the extent are next to each other
2797 * they get deleted below in one shot 2340 * they get deleted below in one shot
2798 */ 2341 */
@@ -2800,11 +2343,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2800 num_to_del = 2; 2343 num_to_del = 2;
2801 } else if (found_extent) { 2344 } else if (found_extent) {
2802 /* otherwise delete the extent back ref */ 2345 /* otherwise delete the extent back ref */
2803 ret = remove_extent_backref(trans, extent_root, path); 2346 ret = remove_extent_backref(trans, extent_root, path,
2347 refs_to_drop);
2804 BUG_ON(ret); 2348 BUG_ON(ret);
2805 /* if refs are 0, we need to setup the path for deletion */ 2349 /* if refs are 0, we need to setup the path for deletion */
2806 if (refs == 0) { 2350 if (refs == 0) {
2807 btrfs_release_path(extent_root, path); 2351 btrfs_release_path(extent_root, path);
2352 path->leave_spinning = 1;
2808 ret = btrfs_search_slot(trans, extent_root, &key, path, 2353 ret = btrfs_search_slot(trans, extent_root, &key, path,
2809 -1, 1); 2354 -1, 1);
2810 BUG_ON(ret); 2355 BUG_ON(ret);
@@ -2814,16 +2359,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2814 if (refs == 0) { 2359 if (refs == 0) {
2815 u64 super_used; 2360 u64 super_used;
2816 u64 root_used; 2361 u64 root_used;
2362 struct extent_buffer *must_clean = NULL;
2817 2363
2818 if (pin) { 2364 if (pin) {
2819 mutex_lock(&root->fs_info->pinned_mutex); 2365 ret = pin_down_bytes(trans, root, path,
2820 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2366 bytenr, num_bytes,
2821 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2367 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2822 mutex_unlock(&root->fs_info->pinned_mutex); 2368 &must_clean);
2823 if (ret > 0) 2369 if (ret > 0)
2824 mark_free = 1; 2370 mark_free = 1;
2825 BUG_ON(ret < 0); 2371 BUG_ON(ret < 0);
2826 } 2372 }
2373
2827 /* block accounting for super block */ 2374 /* block accounting for super block */
2828 spin_lock(&info->delalloc_lock); 2375 spin_lock(&info->delalloc_lock);
2829 super_used = btrfs_super_bytes_used(&info->super_copy); 2376 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2382,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2835 btrfs_set_root_used(&root->root_item, 2382 btrfs_set_root_used(&root->root_item,
2836 root_used - num_bytes); 2383 root_used - num_bytes);
2837 spin_unlock(&info->delalloc_lock); 2384 spin_unlock(&info->delalloc_lock);
2385
2386 /*
2387 * it is going to be very rare for someone to be waiting
2388 * on the block we're freeing. del_items might need to
2389 * schedule, so rather than get fancy, just force it
2390 * to blocking here
2391 */
2392 if (must_clean)
2393 btrfs_set_lock_blocking(must_clean);
2394
2838 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2395 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2839 num_to_del); 2396 num_to_del);
2840 BUG_ON(ret); 2397 BUG_ON(ret);
2841 btrfs_release_path(extent_root, path); 2398 btrfs_release_path(extent_root, path);
2842 2399
2400 if (must_clean) {
2401 clean_tree_block(NULL, root, must_clean);
2402 btrfs_tree_unlock(must_clean);
2403 free_extent_buffer(must_clean);
2404 }
2405
2843 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2406 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2844 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2407 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2845 BUG_ON(ret); 2408 BUG_ON(ret);
2409 } else {
2410 invalidate_mapping_pages(info->btree_inode->i_mapping,
2411 bytenr >> PAGE_CACHE_SHIFT,
2412 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2846 } 2413 }
2847 2414
2848 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2415 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2417,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2850 BUG_ON(ret); 2417 BUG_ON(ret);
2851 } 2418 }
2852 btrfs_free_path(path); 2419 btrfs_free_path(path);
2853 finish_current_insert(trans, extent_root, 0);
2854 return ret; 2420 return ret;
2855} 2421}
2856 2422
2857/* 2423/*
2858 * find all the blocks marked as pending in the radix tree and remove 2424 * remove an extent from the root, returns 0 on success
2859 * them from the extent map
2860 */ 2425 */
2861static int del_pending_extents(struct btrfs_trans_handle *trans, 2426static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2862 struct btrfs_root *extent_root, int all) 2427 struct btrfs_root *root,
2428 u64 bytenr, u64 num_bytes, u64 parent,
2429 u64 root_objectid, u64 ref_generation,
2430 u64 owner_objectid, int pin,
2431 int refs_to_drop)
2863{ 2432{
2864 int ret; 2433 WARN_ON(num_bytes < root->sectorsize);
2865 int err = 0;
2866 u64 start;
2867 u64 end;
2868 u64 priv;
2869 u64 search = 0;
2870 int nr = 0, skipped = 0;
2871 struct extent_io_tree *pending_del;
2872 struct extent_io_tree *extent_ins;
2873 struct pending_extent_op *extent_op;
2874 struct btrfs_fs_info *info = extent_root->fs_info;
2875 struct list_head delete_list;
2876
2877 INIT_LIST_HEAD(&delete_list);
2878 extent_ins = &extent_root->fs_info->extent_ins;
2879 pending_del = &extent_root->fs_info->pending_del;
2880
2881again:
2882 mutex_lock(&info->extent_ins_mutex);
2883 while (1) {
2884 ret = find_first_extent_bit(pending_del, search, &start, &end,
2885 EXTENT_WRITEBACK);
2886 if (ret) {
2887 if (all && skipped && !nr) {
2888 search = 0;
2889 skipped = 0;
2890 continue;
2891 }
2892 mutex_unlock(&info->extent_ins_mutex);
2893 break;
2894 }
2895
2896 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2897 if (!ret) {
2898 search = end+1;
2899 skipped = 1;
2900
2901 if (need_resched()) {
2902 mutex_unlock(&info->extent_ins_mutex);
2903 cond_resched();
2904 mutex_lock(&info->extent_ins_mutex);
2905 }
2906
2907 continue;
2908 }
2909 BUG_ON(ret < 0);
2910
2911 ret = get_state_private(pending_del, start, &priv);
2912 BUG_ON(ret);
2913 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2914
2915 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2916 GFP_NOFS);
2917 if (!test_range_bit(extent_ins, start, end,
2918 EXTENT_WRITEBACK, 0)) {
2919 list_add_tail(&extent_op->list, &delete_list);
2920 nr++;
2921 } else {
2922 kfree(extent_op);
2923
2924 ret = get_state_private(&info->extent_ins, start,
2925 &priv);
2926 BUG_ON(ret);
2927 extent_op = (struct pending_extent_op *)
2928 (unsigned long)priv;
2929
2930 clear_extent_bits(&info->extent_ins, start, end,
2931 EXTENT_WRITEBACK, GFP_NOFS);
2932
2933 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2934 list_add_tail(&extent_op->list, &delete_list);
2935 search = end + 1;
2936 nr++;
2937 continue;
2938 }
2939
2940 mutex_lock(&extent_root->fs_info->pinned_mutex);
2941 ret = pin_down_bytes(trans, extent_root, start,
2942 end + 1 - start, 0);
2943 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2944
2945 ret = update_block_group(trans, extent_root, start,
2946 end + 1 - start, 0, ret > 0);
2947
2948 unlock_extent(extent_ins, start, end, GFP_NOFS);
2949 BUG_ON(ret);
2950 kfree(extent_op);
2951 }
2952 if (ret)
2953 err = ret;
2954
2955 search = end + 1;
2956
2957 if (need_resched()) {
2958 mutex_unlock(&info->extent_ins_mutex);
2959 cond_resched();
2960 mutex_lock(&info->extent_ins_mutex);
2961 }
2962 }
2963 2434
2964 if (nr) { 2435 /*
2965 ret = free_extents(trans, extent_root, &delete_list); 2436 * if metadata always pin
2966 BUG_ON(ret); 2437 * if data pin when any transaction has committed this
2967 } 2438 */
2439 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2440 ref_generation != trans->transid)
2441 pin = 1;
2968 2442
2969 if (all && skipped) { 2443 if (ref_generation != trans->transid)
2970 INIT_LIST_HEAD(&delete_list); 2444 pin = 1;
2971 search = 0;
2972 nr = 0;
2973 goto again;
2974 }
2975 2445
2976 if (!err) 2446 return __free_extent(trans, root, bytenr, num_bytes, parent,
2977 finish_current_insert(trans, extent_root, 0); 2447 root_objectid, ref_generation,
2978 return err; 2448 owner_objectid, pin, pin == 0, refs_to_drop);
2979} 2449}
2980 2450
2981/* 2451/*
2982 * remove an extent from the root, returns 0 on success 2452 * when we free an extent, it is possible (and likely) that we free the last
2453 * delayed ref for that extent as well. This searches the delayed ref tree for
2454 * a given extent, and if there are no other delayed refs to be processed, it
2455 * removes it from the tree.
2983 */ 2456 */
2984static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2457static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2985 struct btrfs_root *root, 2458 struct btrfs_root *root, u64 bytenr)
2986 u64 bytenr, u64 num_bytes, u64 parent,
2987 u64 root_objectid, u64 ref_generation,
2988 u64 owner_objectid, int pin)
2989{ 2459{
2990 struct btrfs_root *extent_root = root->fs_info->extent_root; 2460 struct btrfs_delayed_ref_head *head;
2991 int pending_ret; 2461 struct btrfs_delayed_ref_root *delayed_refs;
2462 struct btrfs_delayed_ref_node *ref;
2463 struct rb_node *node;
2992 int ret; 2464 int ret;
2993 2465
2994 WARN_ON(num_bytes < root->sectorsize); 2466 delayed_refs = &trans->transaction->delayed_refs;
2995 if (root == extent_root) { 2467 spin_lock(&delayed_refs->lock);
2996 struct pending_extent_op *extent_op = NULL; 2468 head = btrfs_find_delayed_ref_head(trans, bytenr);
2997 2469 if (!head)
2998 mutex_lock(&root->fs_info->extent_ins_mutex); 2470 goto out;
2999 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
3000 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
3001 u64 priv;
3002 ret = get_state_private(&root->fs_info->extent_ins,
3003 bytenr, &priv);
3004 BUG_ON(ret);
3005 extent_op = (struct pending_extent_op *)
3006 (unsigned long)priv;
3007 2471
3008 extent_op->del = 1; 2472 node = rb_prev(&head->node.rb_node);
3009 if (extent_op->type == PENDING_EXTENT_INSERT) { 2473 if (!node)
3010 mutex_unlock(&root->fs_info->extent_ins_mutex); 2474 goto out;
3011 return 0;
3012 }
3013 }
3014 2475
3015 if (extent_op) { 2476 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3016 ref_generation = extent_op->orig_generation;
3017 parent = extent_op->orig_parent;
3018 }
3019 2477
3020 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2478 /* there are still entries for this ref, we can't drop it */
3021 BUG_ON(!extent_op); 2479 if (ref->bytenr == bytenr)
3022 2480 goto out;
3023 extent_op->type = PENDING_EXTENT_DELETE;
3024 extent_op->bytenr = bytenr;
3025 extent_op->num_bytes = num_bytes;
3026 extent_op->parent = parent;
3027 extent_op->orig_parent = parent;
3028 extent_op->generation = ref_generation;
3029 extent_op->orig_generation = ref_generation;
3030 extent_op->level = (int)owner_objectid;
3031 INIT_LIST_HEAD(&extent_op->list);
3032 extent_op->del = 0;
3033
3034 set_extent_bits(&root->fs_info->pending_del,
3035 bytenr, bytenr + num_bytes - 1,
3036 EXTENT_WRITEBACK, GFP_NOFS);
3037 set_state_private(&root->fs_info->pending_del,
3038 bytenr, (unsigned long)extent_op);
3039 mutex_unlock(&root->fs_info->extent_ins_mutex);
3040 return 0;
3041 }
3042 /* if metadata always pin */
3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3045 mutex_lock(&root->fs_info->pinned_mutex);
3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3047 mutex_unlock(&root->fs_info->pinned_mutex);
3048 update_reserved_extents(root, bytenr, num_bytes, 0);
3049 return 0;
3050 }
3051 pin = 1;
3052 }
3053 2481
3054 /* if data pin when any transaction has committed this */ 2482 /*
3055 if (ref_generation != trans->transid) 2483 * waiting for the lock here would deadlock. If someone else has it
3056 pin = 1; 2484 * locked they are already in the process of dropping it anyway
2485 */
2486 if (!mutex_trylock(&head->mutex))
2487 goto out;
3057 2488
3058 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2489 /*
3059 root_objectid, ref_generation, 2490 * at this point we have a head with no other entries. Go
3060 owner_objectid, pin, pin == 0); 2491 * ahead and process it.
2492 */
2493 head->node.in_tree = 0;
2494 rb_erase(&head->node.rb_node, &delayed_refs->root);
2495
2496 delayed_refs->num_entries--;
2497
2498 /*
2499 * we don't take a ref on the node because we're removing it from the
2500 * tree, so we just steal the ref the tree was holding.
2501 */
2502 delayed_refs->num_heads--;
2503 if (list_empty(&head->cluster))
2504 delayed_refs->num_heads_ready--;
2505
2506 list_del_init(&head->cluster);
2507 spin_unlock(&delayed_refs->lock);
3061 2508
3062 finish_current_insert(trans, root->fs_info->extent_root, 0); 2509 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
3063 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2510 &head->node, head->must_insert_reserved);
3064 return ret ? ret : pending_ret; 2511 BUG_ON(ret);
2512 btrfs_put_delayed_ref(&head->node);
2513 return 0;
2514out:
2515 spin_unlock(&delayed_refs->lock);
2516 return 0;
3065} 2517}
3066 2518
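The mutex_trylock() in check_ref_cleanup() is the load-bearing detail: blocking on head->mutex here could deadlock, and a contended lock already means another path is dropping the head. A user-space sketch of the same bail-out using pthread_mutex_trylock(), which returns EBUSY on a default mutex that is already held:

    /* build with: cc -pthread sketch.c */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* if someone else holds the lock they are already tearing this
     * entry down, so waiting would only risk deadlock */
    static int try_cleanup(void)
    {
            if (pthread_mutex_trylock(&head_mutex) != 0) {
                    printf("busy: another path owns it, skip\n");
                    return 0;
            }
            printf("locked: safe to tear the entry down\n");
            pthread_mutex_unlock(&head_mutex);
            return 1;
    }

    int main(void)
    {
            try_cleanup();                     /* uncontended: proceeds */
            pthread_mutex_lock(&head_mutex);
            try_cleanup();                     /* contended: bails out */
            pthread_mutex_unlock(&head_mutex);
            return 0;
    }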
3067int btrfs_free_extent(struct btrfs_trans_handle *trans, 2519int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2524,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3072{ 2524{
3073 int ret; 2525 int ret;
3074 2526
3075 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2527 /*
3076 root_objectid, ref_generation, 2528 * tree log blocks never actually go into the extent allocation
3077 owner_objectid, pin); 2529 * tree, just update pinning info and exit early.
2530 *
2531 * data extents referenced by the tree log do need to have
2532 * their reference counts bumped.
2533 */
2534 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2535 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2536 /* unlocks the pinned mutex */
2537 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2538 update_reserved_extents(root, bytenr, num_bytes, 0);
2539 ret = 0;
2540 } else {
2541 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2542 root_objectid, ref_generation,
2543 owner_objectid,
2544 BTRFS_DROP_DELAYED_REF, 1);
2545 BUG_ON(ret);
2546 ret = check_ref_cleanup(trans, root, bytenr);
2547 BUG_ON(ret);
2548 }
3078 return ret; 2549 return ret;
3079} 2550}
3080 2551
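btrfs_free_extent() now routes frees one of two ways: tree-log metadata is pinned immediately, and everything else becomes a delayed DROP ref followed by check_ref_cleanup(). A compressed sketch of that decision; the constants mirror BTRFS_TREE_LOG_OBJECTID and BTRFS_FIRST_FREE_OBJECTID but are hard-coded here only for illustration.

    #include <stdio.h>

    #define TREE_LOG_OBJECTID ((unsigned long long)-6)   /* illustrative */
    #define FIRST_FREE_OBJECTID 256ULL                   /* illustrative */

    static const char *free_path(unsigned long long root_objectid,
                                 unsigned long long owner_objectid)
    {
            if (root_objectid == TREE_LOG_OBJECTID &&
                owner_objectid < FIRST_FREE_OBJECTID)
                    return "pin now, skip the extent tree";
            return "queue a DROP delayed ref, then check_ref_cleanup()";
    }

    int main(void)
    {
            printf("log metadata: %s\n", free_path(TREE_LOG_OBJECTID, 1));
            printf("regular file data: %s\n", free_path(5, 300));
            return 0;
    }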
@@ -3103,228 +2574,262 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3103{ 2574{
3104 int ret = 0; 2575 int ret = 0;
3105 struct btrfs_root *root = orig_root->fs_info->extent_root; 2576 struct btrfs_root *root = orig_root->fs_info->extent_root;
3106 u64 total_needed = num_bytes; 2577 struct btrfs_free_cluster *last_ptr = NULL;
3107 u64 *last_ptr = NULL;
3108 u64 last_wanted = 0;
3109 struct btrfs_block_group_cache *block_group = NULL; 2578 struct btrfs_block_group_cache *block_group = NULL;
3110 int chunk_alloc_done = 0;
3111 int empty_cluster = 2 * 1024 * 1024; 2579 int empty_cluster = 2 * 1024 * 1024;
3112 int allowed_chunk_alloc = 0; 2580 int allowed_chunk_alloc = 0;
3113 struct list_head *head = NULL, *cur = NULL;
3114 int loop = 0;
3115 int extra_loop = 0;
3116 struct btrfs_space_info *space_info; 2581 struct btrfs_space_info *space_info;
2582 int last_ptr_loop = 0;
2583 int loop = 0;
3117 2584
3118 WARN_ON(num_bytes < root->sectorsize); 2585 WARN_ON(num_bytes < root->sectorsize);
3119 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2586 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3120 ins->objectid = 0; 2587 ins->objectid = 0;
3121 ins->offset = 0; 2588 ins->offset = 0;
3122 2589
2590 space_info = __find_space_info(root->fs_info, data);
2591
3123 if (orig_root->ref_cows || empty_size) 2592 if (orig_root->ref_cows || empty_size)
3124 allowed_chunk_alloc = 1; 2593 allowed_chunk_alloc = 1;
3125 2594
3126 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2595 if (data & BTRFS_BLOCK_GROUP_METADATA) {
3127 last_ptr = &root->fs_info->last_alloc; 2596 last_ptr = &root->fs_info->meta_alloc_cluster;
3128 if (!btrfs_test_opt(root, SSD)) 2597 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024; 2598 empty_cluster = 64 * 1024;
3130 } 2599 }
3131 2600
3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2601 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3133 last_ptr = &root->fs_info->last_data_alloc; 2602 last_ptr = &root->fs_info->data_alloc_cluster;
2603 }
3134 2604
3135 if (last_ptr) { 2605 if (last_ptr) {
3136 if (*last_ptr) { 2606 spin_lock(&last_ptr->lock);
3137 hint_byte = *last_ptr; 2607 if (last_ptr->block_group)
3138 last_wanted = *last_ptr; 2608 hint_byte = last_ptr->window_start;
3139 } else 2609 spin_unlock(&last_ptr->lock);
3140 empty_size += empty_cluster;
3141 } else {
3142 empty_cluster = 0;
3143 } 2610 }
2611
3144 search_start = max(search_start, first_logical_byte(root, 0)); 2612 search_start = max(search_start, first_logical_byte(root, 0));
3145 search_start = max(search_start, hint_byte); 2613 search_start = max(search_start, hint_byte);
3146 2614
3147 if (last_wanted && search_start != last_wanted) { 2615 if (!last_ptr) {
3148 last_wanted = 0; 2616 empty_cluster = 0;
3149 empty_size += empty_cluster; 2617 loop = 1;
3150 } 2618 }
3151 2619
3152 total_needed += empty_size; 2620 if (search_start == hint_byte) {
3153 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2621 block_group = btrfs_lookup_block_group(root->fs_info,
3154 if (!block_group) 2622 search_start);
3155 block_group = btrfs_lookup_first_block_group(root->fs_info, 2623 if (block_group && block_group_bits(block_group, data)) {
3156 search_start); 2624 down_read(&space_info->groups_sem);
3157 space_info = __find_space_info(root->fs_info, data); 2625 if (list_empty(&block_group->list) ||
2626 block_group->ro) {
2627 /*
2628 * someone is removing this block group,
2629 * we can't jump into the have_block_group
2630 * target because our list pointers are not
2631 * valid
2632 */
2633 btrfs_put_block_group(block_group);
2634 up_read(&space_info->groups_sem);
2635 } else
2636 goto have_block_group;
2637 } else if (block_group) {
2638 btrfs_put_block_group(block_group);
2639 }
2640 }
3158 2641
2642search:
3159 down_read(&space_info->groups_sem); 2643 down_read(&space_info->groups_sem);
3160 while (1) { 2644 list_for_each_entry(block_group, &space_info->block_groups, list) {
3161 struct btrfs_free_space *free_space; 2645 u64 offset;
3162 /*
3163	 * the only way this happens is if our hint points to a block
3164	 * group that's not of the proper type, while looping this
3165 * should never happen
3166 */
3167 if (empty_size)
3168 extra_loop = 1;
3169 2646
3170 if (!block_group) 2647 atomic_inc(&block_group->count);
3171 goto new_group_no_lock; 2648 search_start = block_group->key.objectid;
3172 2649
2650have_block_group:
3173 if (unlikely(!block_group->cached)) { 2651 if (unlikely(!block_group->cached)) {
3174 mutex_lock(&block_group->cache_mutex); 2652 mutex_lock(&block_group->cache_mutex);
3175 ret = cache_block_group(root, block_group); 2653 ret = cache_block_group(root, block_group);
3176 mutex_unlock(&block_group->cache_mutex); 2654 mutex_unlock(&block_group->cache_mutex);
3177 if (ret) 2655 if (ret) {
2656 btrfs_put_block_group(block_group);
3178 break; 2657 break;
2658 }
3179 } 2659 }
3180 2660
3181 mutex_lock(&block_group->alloc_mutex);
3182 if (unlikely(!block_group_bits(block_group, data)))
3183 goto new_group;
3184
3185 if (unlikely(block_group->ro)) 2661 if (unlikely(block_group->ro))
3186 goto new_group; 2662 goto loop;
3187 2663
3188 free_space = btrfs_find_free_space(block_group, search_start, 2664 if (last_ptr) {
3189 total_needed); 2665 /*
3190 if (free_space) { 2666 * the refill lock keeps out other
3191 u64 start = block_group->key.objectid; 2667 * people trying to start a new cluster
3192 u64 end = block_group->key.objectid + 2668 */
3193 block_group->key.offset; 2669 spin_lock(&last_ptr->refill_lock);
2670 if (last_ptr->block_group &&
2671 (last_ptr->block_group->ro ||
2672 !block_group_bits(last_ptr->block_group, data))) {
2673 offset = 0;
2674 goto refill_cluster;
2675 }
3194 2676
3195 search_start = stripe_align(root, free_space->offset); 2677 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2678 num_bytes, search_start);
2679 if (offset) {
2680 /* we have a block, we're done */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2683 }
3196 2684
3197 /* move on to the next group */ 2685 spin_lock(&last_ptr->lock);
3198 if (search_start + num_bytes >= search_end) 2686 /*
3199 goto new_group; 2687 * whoops, this cluster doesn't actually point to
2688 * this block group. Get a ref on the block
2689	 * group it does point to and try again
2690 */
2691 if (!last_ptr_loop && last_ptr->block_group &&
2692 last_ptr->block_group != block_group) {
3200 2693
3201 /* move on to the next group */ 2694 btrfs_put_block_group(block_group);
3202 if (search_start + num_bytes > end) 2695 block_group = last_ptr->block_group;
3203 goto new_group; 2696 atomic_inc(&block_group->count);
2697 spin_unlock(&last_ptr->lock);
2698 spin_unlock(&last_ptr->refill_lock);
3204 2699
3205 if (last_wanted && search_start != last_wanted) { 2700 last_ptr_loop = 1;
3206 total_needed += empty_cluster; 2701 search_start = block_group->key.objectid;
3207 empty_size += empty_cluster;
3208 last_wanted = 0;
3209 /* 2702 /*
3210 * if search_start is still in this block group 2703 * we know this block group is properly
3211 * then we just re-search this block group 2704 * in the list because
2705	 * btrfs_remove_block_group drops the
2706 * cluster before it removes the block
2707 * group from the list
3212 */ 2708 */
3213 if (search_start >= start && 2709 goto have_block_group;
3214 search_start < end) {
3215 mutex_unlock(&block_group->alloc_mutex);
3216 continue;
3217 }
3218
3219 /* else we go to the next block group */
3220 goto new_group;
3221 } 2710 }
2711 spin_unlock(&last_ptr->lock);
2712refill_cluster:
2713 /*
2714 * this cluster didn't work out, free it and
2715 * start over
2716 */
2717 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2718
2719 last_ptr_loop = 0;
3222 2720
3223 if (exclude_nr > 0 && 2721 /* allocate a cluster in this block group */
3224 (search_start + num_bytes > exclude_start && 2722 ret = btrfs_find_space_cluster(trans,
3225 search_start < exclude_start + exclude_nr)) { 2723 block_group, last_ptr,
3226 search_start = exclude_start + exclude_nr; 2724 offset, num_bytes,
2725 empty_cluster + empty_size);
2726 if (ret == 0) {
3227 /* 2727 /*
3228 * if search_start is still in this block group 2728 * now pull our allocation out of this
3229 * then we just re-search this block group 2729 * cluster
3230 */ 2730 */
3231 if (search_start >= start && 2731 offset = btrfs_alloc_from_cluster(block_group,
3232 search_start < end) { 2732 last_ptr, num_bytes,
3233 mutex_unlock(&block_group->alloc_mutex); 2733 search_start);
3234 last_wanted = 0; 2734 if (offset) {
3235 continue; 2735 /* we found one, proceed */
2736 spin_unlock(&last_ptr->refill_lock);
2737 goto checks;
3236 } 2738 }
3237
3238 /* else we go to the next block group */
3239 goto new_group;
3240 } 2739 }
2740 /*
2741 * at this point we either didn't find a cluster
2742 * or we weren't able to allocate a block from our
2743 * cluster. Free the cluster we've been trying
2744 * to use, and go to the next block group
2745 */
2746 if (loop < 2) {
2747 btrfs_return_cluster_to_free_space(NULL,
2748 last_ptr);
2749 spin_unlock(&last_ptr->refill_lock);
2750 goto loop;
2751 }
2752 spin_unlock(&last_ptr->refill_lock);
2753 }
3241 2754
3242 ins->objectid = search_start; 2755 offset = btrfs_find_space_for_alloc(block_group, search_start,
3243 ins->offset = num_bytes; 2756 num_bytes, empty_size);
2757 if (!offset)
2758 goto loop;
2759checks:
2760 search_start = stripe_align(root, offset);
3244 2761
3245 btrfs_remove_free_space_lock(block_group, search_start, 2762 /* move on to the next group */
3246 num_bytes); 2763 if (search_start + num_bytes >= search_end) {
3247 /* we are all good, lets return */ 2764 btrfs_add_free_space(block_group, offset, num_bytes);
3248 mutex_unlock(&block_group->alloc_mutex); 2765 goto loop;
3249 break;
3250 } 2766 }
3251new_group:
3252 mutex_unlock(&block_group->alloc_mutex);
3253 put_block_group(block_group);
3254 block_group = NULL;
3255new_group_no_lock:
3256 /* don't try to compare new allocations against the
3257 * last allocation any more
3258 */
3259 last_wanted = 0;
3260 2767
3261 /* 2768 /* move on to the next group */
3262 * Here's how this works. 2769 if (search_start + num_bytes >
3263 * loop == 0: we were searching a block group via a hint 2770 block_group->key.objectid + block_group->key.offset) {
3264 * and didn't find anything, so we start at 2771 btrfs_add_free_space(block_group, offset, num_bytes);
3265 * the head of the block groups and keep searching 2772 goto loop;
3266 * loop == 1: we're searching through all of the block groups 2773 }
3267 * if we hit the head again we have searched 2774
3268 * all of the block groups for this space and we 2775 if (exclude_nr > 0 &&
3269 * need to try and allocate, if we cant error out. 2776 (search_start + num_bytes > exclude_start &&
3270 * loop == 2: we allocated more space and are looping through 2777 search_start < exclude_start + exclude_nr)) {
3271 * all of the block groups again. 2778 search_start = exclude_start + exclude_nr;
3272 */ 2779
3273 if (loop == 0) { 2780 btrfs_add_free_space(block_group, offset, num_bytes);
3274 head = &space_info->block_groups; 2781 /*
3275 cur = head->next; 2782 * if search_start is still in this block group
3276 loop++; 2783 * then we just re-search this block group
3277 } else if (loop == 1 && cur == head) {
3278 int keep_going;
3279
3280 /* at this point we give up on the empty_size
3281 * allocations and just try to allocate the min
3282 * space.
3283 *
3284 * The extra_loop field was set if an empty_size
3285 * allocation was attempted above, and if this
3286	 * is true we need to try the loop again without
3287 * the additional empty_size.
3288 */ 2784 */
3289 total_needed -= empty_size; 2785 if (search_start >= block_group->key.objectid &&
3290 empty_size = 0; 2786 search_start < (block_group->key.objectid +
3291 keep_going = extra_loop; 2787 block_group->key.offset))
3292 loop++; 2788 goto have_block_group;
2789 goto loop;
2790 }
3293 2791
3294 if (allowed_chunk_alloc && !chunk_alloc_done) { 2792 ins->objectid = search_start;
3295 up_read(&space_info->groups_sem); 2793 ins->offset = num_bytes;
3296 ret = do_chunk_alloc(trans, root, num_bytes + 2794
3297 2 * 1024 * 1024, data, 1); 2795 if (offset < search_start)
3298 down_read(&space_info->groups_sem); 2796 btrfs_add_free_space(block_group, offset,
3299 if (ret < 0) 2797 search_start - offset);
3300 goto loop_check; 2798 BUG_ON(offset > search_start);
3301 head = &space_info->block_groups; 2799
3302 /* 2800 /* we are all good, lets return */
3303 * we've allocated a new chunk, keep 2801 break;
3304 * trying 2802loop:
3305 */ 2803 btrfs_put_block_group(block_group);
3306 keep_going = 1; 2804 }
3307 chunk_alloc_done = 1; 2805 up_read(&space_info->groups_sem);
3308 } else if (!allowed_chunk_alloc) { 2806
3309 space_info->force_alloc = 1; 2807 /* loop == 0, try to find a clustered alloc in every block group
3310 } 2808 * loop == 1, try again after forcing a chunk allocation
3311loop_check: 2809 * loop == 2, set empty_size and empty_cluster to 0 and try again
3312 if (keep_going) { 2810 */
3313 cur = head->next; 2811 if (!ins->objectid && loop < 3 &&
3314 extra_loop = 0; 2812 (empty_size || empty_cluster || allowed_chunk_alloc)) {
3315 } else { 2813 if (loop >= 2) {
3316 break; 2814 empty_size = 0;
3317 } 2815 empty_cluster = 0;
3318 } else if (cur == head) {
3319 break;
3320 } 2816 }
3321 2817
3322 block_group = list_entry(cur, struct btrfs_block_group_cache, 2818 if (allowed_chunk_alloc) {
3323 list); 2819 ret = do_chunk_alloc(trans, root, num_bytes +
3324 atomic_inc(&block_group->count); 2820 2 * 1024 * 1024, data, 1);
2821 allowed_chunk_alloc = 0;
2822 } else {
2823 space_info->force_alloc = 1;
2824 }
3325 2825
3326 search_start = block_group->key.objectid; 2826 if (loop < 3) {
3327 cur = cur->next; 2827 loop++;
2828 goto search;
2829 }
2830 ret = -ENOSPC;
2831 } else if (!ins->objectid) {
2832 ret = -ENOSPC;
3328 } 2833 }
3329 2834
3330 /* we found what we needed */ 2835 /* we found what we needed */
@@ -3332,21 +2837,10 @@ loop_check:
3332 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2837 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3333 trans->block_group = block_group->key.objectid; 2838 trans->block_group = block_group->key.objectid;
3334 2839
3335 if (last_ptr) 2840 btrfs_put_block_group(block_group);
3336 *last_ptr = ins->objectid + ins->offset;
3337 ret = 0; 2841 ret = 0;
3338 } else if (!ret) {
3339 printk(KERN_ERR "btrfs searching for %llu bytes, "
3340 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3341 (unsigned long long)total_needed,
3342 (unsigned long long)num_bytes,
3343 loop, allowed_chunk_alloc);
3344 ret = -ENOSPC;
3345 } 2842 }
3346 if (block_group)
3347 put_block_group(block_group);
3348 2843
3349 up_read(&space_info->groups_sem);
3350 return ret; 2844 return ret;
3351} 2845}
3352 2846
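The rewritten find_free_extent() replaces the old head/cur cursor bookkeeping with a simple escalation counter: pass 0 tries clustered allocation, pass 1 retries after forcing a chunk allocation, and pass 2 drops empty_size and empty_cluster before giving up with ENOSPC. The control flow in isolation; try_alloc() is a fake allocator used only to drive the loop.

    #include <stdio.h>

    #define ENOSPC 28

    /* fake allocator: only succeeds once the padding is gone, forcing
     * the loop through every escalation stage */
    static int try_alloc(unsigned long empty_size, unsigned long empty_cluster)
    {
            return empty_size == 0 && empty_cluster == 0;
    }

    static int find_free(unsigned long empty_size, unsigned long empty_cluster)
    {
            for (int loop = 0; loop < 3; loop++) {
                    if (loop == 1) {
                            /* second pass: this is where a chunk
                             * allocation would be forced */
                    }
                    if (loop == 2) {
                            /* final pass: drop the padding entirely */
                            empty_size = 0;
                            empty_cluster = 0;
                    }
                    if (try_alloc(empty_size, empty_cluster)) {
                            printf("allocated on pass %d\n", loop);
                            return 0;
                    }
            }
            return -ENOSPC;
    }

    int main(void)
    {
            return find_free(2 * 1024 * 1024, 64 * 1024) ? 1 : 0;
    }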
@@ -3359,9 +2853,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3359 info->bytes_pinned - info->bytes_reserved), 2853 info->bytes_pinned - info->bytes_reserved),
3360 (info->full) ? "" : "not "); 2854 (info->full) ? "" : "not ");
3361 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 2855 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 " may_use=%llu, used=%llu\n", info->total_bytes, 2856 " may_use=%llu, used=%llu\n",
3363 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, 2857 (unsigned long long)info->total_bytes,
3364 info->bytes_used); 2858 (unsigned long long)info->bytes_pinned,
2859 (unsigned long long)info->bytes_delalloc,
2860 (unsigned long long)info->bytes_may_use,
2861 (unsigned long long)info->bytes_used);
3365 2862
3366 down_read(&info->groups_sem); 2863 down_read(&info->groups_sem);
3367 list_for_each_entry(cache, &info->block_groups, list) { 2864 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3451,7 +2948,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3451 ret = btrfs_discard_extent(root, start, len); 2948 ret = btrfs_discard_extent(root, start, len);
3452 2949
3453 btrfs_add_free_space(cache, start, len); 2950 btrfs_add_free_space(cache, start, len);
3454 put_block_group(cache); 2951 btrfs_put_block_group(cache);
3455 update_reserved_extents(root, start, len, 0); 2952 update_reserved_extents(root, start, len, 0);
3456 2953
3457 return ret; 2954 return ret;
@@ -3475,10 +2972,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3475static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2972static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3476 struct btrfs_root *root, u64 parent, 2973 struct btrfs_root *root, u64 parent,
3477 u64 root_objectid, u64 ref_generation, 2974 u64 root_objectid, u64 ref_generation,
3478 u64 owner, struct btrfs_key *ins) 2975 u64 owner, struct btrfs_key *ins,
2976 int ref_mod)
3479{ 2977{
3480 int ret; 2978 int ret;
3481 int pending_ret;
3482 u64 super_used; 2979 u64 super_used;
3483 u64 root_used; 2980 u64 root_used;
3484 u64 num_bytes = ins->offset; 2981 u64 num_bytes = ins->offset;
@@ -3503,33 +3000,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3503 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 3000 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3504 spin_unlock(&info->delalloc_lock); 3001 spin_unlock(&info->delalloc_lock);
3505 3002
3506 if (root == extent_root) {
3507 struct pending_extent_op *extent_op;
3508
3509 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3510 BUG_ON(!extent_op);
3511
3512 extent_op->type = PENDING_EXTENT_INSERT;
3513 extent_op->bytenr = ins->objectid;
3514 extent_op->num_bytes = ins->offset;
3515 extent_op->parent = parent;
3516 extent_op->orig_parent = 0;
3517 extent_op->generation = ref_generation;
3518 extent_op->orig_generation = 0;
3519 extent_op->level = (int)owner;
3520 INIT_LIST_HEAD(&extent_op->list);
3521 extent_op->del = 0;
3522
3523 mutex_lock(&root->fs_info->extent_ins_mutex);
3524 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3525 ins->objectid + ins->offset - 1,
3526 EXTENT_WRITEBACK, GFP_NOFS);
3527 set_state_private(&root->fs_info->extent_ins,
3528 ins->objectid, (unsigned long)extent_op);
3529 mutex_unlock(&root->fs_info->extent_ins_mutex);
3530 goto update_block;
3531 }
3532
3533 memcpy(&keys[0], ins, sizeof(*ins)); 3003 memcpy(&keys[0], ins, sizeof(*ins));
3534 keys[1].objectid = ins->objectid; 3004 keys[1].objectid = ins->objectid;
3535 keys[1].type = BTRFS_EXTENT_REF_KEY; 3005 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +3010,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3540 path = btrfs_alloc_path(); 3010 path = btrfs_alloc_path();
3541 BUG_ON(!path); 3011 BUG_ON(!path);
3542 3012
3013 path->leave_spinning = 1;
3543 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 3014 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3544 sizes, 2); 3015 sizes, 2);
3545 BUG_ON(ret); 3016 BUG_ON(ret);
3546 3017
3547 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3018 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3548 struct btrfs_extent_item); 3019 struct btrfs_extent_item);
3549 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 3020 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3550 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 3021 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3551 struct btrfs_extent_ref); 3022 struct btrfs_extent_ref);
3552 3023
3553 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 3024 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3554 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 3025 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3555 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 3026 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3556 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 3027 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3557 3028
3558 btrfs_mark_buffer_dirty(path->nodes[0]); 3029 btrfs_mark_buffer_dirty(path->nodes[0]);
3559 3030
3560 trans->alloc_exclude_start = 0; 3031 trans->alloc_exclude_start = 0;
3561 trans->alloc_exclude_nr = 0; 3032 trans->alloc_exclude_nr = 0;
3562 btrfs_free_path(path); 3033 btrfs_free_path(path);
3563 finish_current_insert(trans, extent_root, 0);
3564 pending_ret = del_pending_extents(trans, extent_root, 0);
3565 3034
3566 if (ret) 3035 if (ret)
3567 goto out; 3036 goto out;
3568 if (pending_ret) {
3569 ret = pending_ret;
3570 goto out;
3571 }
3572 3037
3573update_block:
3574 ret = update_block_group(trans, root, ins->objectid, 3038 ret = update_block_group(trans, root, ins->objectid,
3575 ins->offset, 1, 0); 3039 ins->offset, 1, 0);
3576 if (ret) { 3040 if (ret) {
@@ -3592,9 +3056,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3592 3056
3593 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 3057 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3594 return 0; 3058 return 0;
3595 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3596 					    ref_generation, owner, ins);
3597 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
3059
3060 	ret = btrfs_add_delayed_ref(trans, ins->objectid,
3061 				    ins->offset, parent, root_objectid,
3062 				    ref_generation, owner,
3063 				    BTRFS_ADD_DELAYED_EXTENT, 0);
3064 	BUG_ON(ret);
3598 return ret; 3065 return ret;
3599} 3066}
3600 3067
@@ -3619,9 +3086,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3619 ret = btrfs_remove_free_space(block_group, ins->objectid, 3086 ret = btrfs_remove_free_space(block_group, ins->objectid,
3620 ins->offset); 3087 ins->offset);
3621 BUG_ON(ret); 3088 BUG_ON(ret);
3622 put_block_group(block_group); 3089 btrfs_put_block_group(block_group);
3623 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3090 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3624 ref_generation, owner, ins); 3091 ref_generation, owner, ins, 1);
3625 return ret; 3092 return ret;
3626} 3093}
3627 3094
@@ -3640,20 +3107,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3640 u64 search_end, struct btrfs_key *ins, u64 data) 3107 u64 search_end, struct btrfs_key *ins, u64 data)
3641{ 3108{
3642 int ret; 3109 int ret;
3643
3644 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3110 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3645 min_alloc_size, empty_size, hint_byte, 3111 min_alloc_size, empty_size, hint_byte,
3646 search_end, ins, data); 3112 search_end, ins, data);
3647 BUG_ON(ret); 3113 BUG_ON(ret);
3648 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3114 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3649 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3650 					root_objectid, ref_generation,
3651 					owner_objectid, ins);
3652 		BUG_ON(ret);
3653
3654 	} else {
3655 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
3656 	}
3115 		ret = btrfs_add_delayed_ref(trans, ins->objectid,
3116 					    ins->offset, parent, root_objectid,
3117 					    ref_generation, owner_objectid,
3118 					    BTRFS_ADD_DELAYED_EXTENT, 0);
3119 		BUG_ON(ret);
3120 	}
3121 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3657 return ret; 3122 return ret;
3658} 3123}
3659 3124
@@ -3789,7 +3254,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3789 3254
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3255 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791 3256
3792 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3257 ret = btrfs_free_extent(trans, root, disk_bytenr,
3793 btrfs_file_extent_disk_num_bytes(leaf, fi), 3258 btrfs_file_extent_disk_num_bytes(leaf, fi),
3794 leaf->start, leaf_owner, leaf_generation, 3259 leaf->start, leaf_owner, leaf_generation,
3795 key.objectid, 0); 3260 key.objectid, 0);
@@ -3829,7 +3294,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3829 */ 3294 */
3830 for (i = 0; i < ref->nritems; i++) { 3295 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot; 3296 info = ref->extents + sorted[i].slot;
3832 ret = __btrfs_free_extent(trans, root, info->bytenr, 3297 ret = btrfs_free_extent(trans, root, info->bytenr,
3833 info->num_bytes, ref->bytenr, 3298 info->num_bytes, ref->bytenr,
3834 ref->owner, ref->generation, 3299 ref->owner, ref->generation,
3835 info->objectid, 0); 3300 info->objectid, 0);
@@ -3846,12 +3311,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3846 return 0; 3311 return 0;
3847} 3312}
3848 3313
3849static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3314static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root, u64 start,
3850 u64 len, u32 *refs) 3316 u64 len, u32 *refs)
3851{ 3317{
3852 int ret; 3318 int ret;
3853 3319
3854 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3320 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3855 BUG_ON(ret); 3321 BUG_ON(ret);
3856 3322
3857#if 0 /* some debugging code in case we see problems here */ 3323#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3425,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3959 * we just decrement it below and don't update any 3425 * we just decrement it below and don't update any
3960 * of the refs the leaf points to. 3426 * of the refs the leaf points to.
3961 */ 3427 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3428 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3429 blocksize, &refs);
3963 BUG_ON(ret); 3430 BUG_ON(ret);
3964 if (refs != 1) 3431 if (refs != 1)
3965 continue; 3432 continue;
@@ -4010,7 +3477,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4010 */ 3477 */
4011 for (i = 0; i < refi; i++) { 3478 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr; 3479 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr, 3480 ret = btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start, 3481 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1); 3482 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret); 3483 BUG_ON(ret);
@@ -4053,7 +3520,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4053 3520
4054 WARN_ON(*level < 0); 3521 WARN_ON(*level < 0);
4055 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3522 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4056 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3523 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4057 path->nodes[*level]->len, &refs); 3524 path->nodes[*level]->len, &refs);
4058 BUG_ON(ret); 3525 BUG_ON(ret);
4059 if (refs > 1) 3526 if (refs > 1)
@@ -4104,7 +3571,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3571 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4105 blocksize = btrfs_level_size(root, *level - 1); 3572 blocksize = btrfs_level_size(root, *level - 1);
4106 3573
4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3574 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3575 blocksize, &refs);
4108 BUG_ON(ret); 3576 BUG_ON(ret);
4109 3577
4110 /* 3578 /*
@@ -4119,7 +3587,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4119 root_gen = btrfs_header_generation(parent); 3587 root_gen = btrfs_header_generation(parent);
4120 path->slots[*level]++; 3588 path->slots[*level]++;
4121 3589
4122 ret = __btrfs_free_extent(trans, root, bytenr, 3590 ret = btrfs_free_extent(trans, root, bytenr,
4123 blocksize, parent->start, 3591 blocksize, parent->start,
4124 root_owner, root_gen, 3592 root_owner, root_gen,
4125 *level - 1, 1); 3593 *level - 1, 1);
@@ -4165,7 +3633,7 @@ out:
4165 * cleanup and free the reference on the last node 3633 * cleanup and free the reference on the last node
4166 * we processed 3634 * we processed
4167 */ 3635 */
4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3636 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4169 parent->start, root_owner, root_gen, 3637 parent->start, root_owner, root_gen,
4170 *level, 1); 3638 *level, 1);
4171 free_extent_buffer(path->nodes[*level]); 3639 free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3822,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4354 struct btrfs_path *path; 3822 struct btrfs_path *path;
4355 int i; 3823 int i;
4356 int orig_level; 3824 int orig_level;
3825 int update_count;
4357 struct btrfs_root_item *root_item = &root->root_item; 3826 struct btrfs_root_item *root_item = &root->root_item;
4358 3827
4359 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3828 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3864,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4395 } 3864 }
4396 } 3865 }
4397 while (1) { 3866 while (1) {
3867 unsigned long update;
4398 wret = walk_down_tree(trans, root, path, &level); 3868 wret = walk_down_tree(trans, root, path, &level);
4399 if (wret > 0) 3869 if (wret > 0)
4400 break; 3870 break;
@@ -4407,12 +3877,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4407 break; 3877 break;
4408 if (wret < 0) 3878 if (wret < 0)
4409 ret = wret; 3879 ret = wret;
4410 if (trans->transaction->in_commit) { 3880 if (trans->transaction->in_commit ||
3881 trans->transaction->delayed_refs.flushing) {
4411 ret = -EAGAIN; 3882 ret = -EAGAIN;
4412 break; 3883 break;
4413 } 3884 }
4414 atomic_inc(&root->fs_info->throttle_gen); 3885 atomic_inc(&root->fs_info->throttle_gen);
4415 wake_up(&root->fs_info->transaction_throttle); 3886 wake_up(&root->fs_info->transaction_throttle);
3887 for (update_count = 0; update_count < 16; update_count++) {
3888 update = trans->delayed_ref_updates;
3889 trans->delayed_ref_updates = 0;
3890 if (update)
3891 btrfs_run_delayed_refs(trans, root, update);
3892 else
3893 break;
3894 }
4416 } 3895 }
4417 for (i = 0; i <= orig_level; i++) { 3896 for (i = 0; i <= orig_level; i++) {
4418 if (path->nodes[i]) { 3897 if (path->nodes[i]) {
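The loop added to btrfs_drop_snapshot() drains the delayed-ref backlog the walk itself generates, but caps the work at 16 rounds per iteration so the dropper is throttled without being starved. A userspace model, where run_pending() stands in for btrfs_run_delayed_refs():

#include <stdio.h>

struct queue {
	unsigned long pending;
};

static void run_pending(struct queue *q, unsigned long n)
{
	(void)q;
	printf("ran %lu updates\n", n);
}

static void drain_some(struct queue *q)
{
	int round;

	for (round = 0; round < 16; round++) {
		unsigned long n = q->pending;

		if (!n)
			break;
		q->pending = 0;		/* new work may be queued meanwhile */
		run_pending(q, n);
	}
}

int main(void)
{
	struct queue q = { 3 };

	drain_some(&q);
	return 0;
}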
@@ -5457,6 +4936,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5457 root->root_key.objectid, 4936 root->root_key.objectid,
5458 trans->transid, key.objectid); 4937 trans->transid, key.objectid);
5459 BUG_ON(ret); 4938 BUG_ON(ret);
4939
5460 ret = btrfs_free_extent(trans, root, 4940 ret = btrfs_free_extent(trans, root,
5461 bytenr, num_bytes, leaf->start, 4941 bytenr, num_bytes, leaf->start,
5462 btrfs_header_owner(leaf), 4942 btrfs_header_owner(leaf),
@@ -5768,9 +5248,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5768 ref_path, NULL, NULL); 5248 ref_path, NULL, NULL);
5769 BUG_ON(ret); 5249 BUG_ON(ret);
5770 5250
5771 if (root == root->fs_info->extent_root)
5772 btrfs_extent_post_op(trans, root);
5773
5774 return 0; 5251 return 0;
5775} 5252}
5776 5253
@@ -6038,6 +5515,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6038 if (!path) 5515 if (!path)
6039 return -ENOMEM; 5516 return -ENOMEM;
6040 5517
5518 path->leave_spinning = 1;
6041 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5519 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6042 if (ret) 5520 if (ret)
6043 goto out; 5521 goto out;
@@ -6208,6 +5686,9 @@ again:
6208 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5686 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6209 mutex_unlock(&root->fs_info->cleaner_mutex); 5687 mutex_unlock(&root->fs_info->cleaner_mutex);
6210 5688
5689 trans = btrfs_start_transaction(info->tree_root, 1);
5690 btrfs_commit_transaction(trans, info->tree_root);
5691
6211 while (1) { 5692 while (1) {
6212 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5693 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6213 if (ret < 0) 5694 if (ret < 0)
@@ -6294,7 +5775,7 @@ next:
6294 WARN_ON(block_group->reserved > 0); 5775 WARN_ON(block_group->reserved > 0);
6295 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5776 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6296 spin_unlock(&block_group->lock); 5777 spin_unlock(&block_group->lock);
6297 put_block_group(block_group); 5778 btrfs_put_block_group(block_group);
6298 ret = 0; 5779 ret = 0;
6299out: 5780out:
6300 btrfs_free_path(path); 5781 btrfs_free_path(path);
@@ -6421,9 +5902,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6421 5902
6422 atomic_set(&cache->count, 1); 5903 atomic_set(&cache->count, 1);
6423 spin_lock_init(&cache->lock); 5904 spin_lock_init(&cache->lock);
6424 mutex_init(&cache->alloc_mutex); 5905 spin_lock_init(&cache->tree_lock);
6425 mutex_init(&cache->cache_mutex); 5906 mutex_init(&cache->cache_mutex);
6426 INIT_LIST_HEAD(&cache->list); 5907 INIT_LIST_HEAD(&cache->list);
5908 INIT_LIST_HEAD(&cache->cluster_list);
6427 read_extent_buffer(leaf, &cache->item, 5909 read_extent_buffer(leaf, &cache->item,
6428 btrfs_item_ptr_offset(leaf, path->slots[0]), 5910 btrfs_item_ptr_offset(leaf, path->slots[0]),
6429 sizeof(cache->item)); 5911 sizeof(cache->item));
@@ -6466,7 +5948,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6466 5948
6467 extent_root = root->fs_info->extent_root; 5949 extent_root = root->fs_info->extent_root;
6468 5950
6469 root->fs_info->last_trans_new_blockgroup = trans->transid; 5951 root->fs_info->last_trans_log_full_commit = trans->transid;
6470 5952
6471 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5953 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6472 if (!cache) 5954 if (!cache)
@@ -6477,9 +5959,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6477 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5959 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6478 atomic_set(&cache->count, 1); 5960 atomic_set(&cache->count, 1);
6479 spin_lock_init(&cache->lock); 5961 spin_lock_init(&cache->lock);
6480 mutex_init(&cache->alloc_mutex); 5962 spin_lock_init(&cache->tree_lock);
6481 mutex_init(&cache->cache_mutex); 5963 mutex_init(&cache->cache_mutex);
6482 INIT_LIST_HEAD(&cache->list); 5964 INIT_LIST_HEAD(&cache->list);
5965 INIT_LIST_HEAD(&cache->cluster_list);
6483 5966
6484 btrfs_set_block_group_used(&cache->item, bytes_used); 5967 btrfs_set_block_group_used(&cache->item, bytes_used);
6485 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5968 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6500,9 +5983,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6500 sizeof(cache->item)); 5983 sizeof(cache->item));
6501 BUG_ON(ret); 5984 BUG_ON(ret);
6502 5985
6503 finish_current_insert(trans, extent_root, 0);
6504 ret = del_pending_extents(trans, extent_root, 0);
6505 BUG_ON(ret);
6506 set_avail_alloc_bits(extent_root->fs_info, type); 5986 set_avail_alloc_bits(extent_root->fs_info, type);
6507 5987
6508 return 0; 5988 return 0;
@@ -6513,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6513{ 5993{
6514 struct btrfs_path *path; 5994 struct btrfs_path *path;
6515 struct btrfs_block_group_cache *block_group; 5995 struct btrfs_block_group_cache *block_group;
5996 struct btrfs_free_cluster *cluster;
6516 struct btrfs_key key; 5997 struct btrfs_key key;
6517 int ret; 5998 int ret;
6518 5999
@@ -6524,6 +6005,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6524 6005
6525 memcpy(&key, &block_group->key, sizeof(key)); 6006 memcpy(&key, &block_group->key, sizeof(key));
6526 6007
6008 /* make sure this block group isn't part of an allocation cluster */
6009 cluster = &root->fs_info->data_alloc_cluster;
6010 spin_lock(&cluster->refill_lock);
6011 btrfs_return_cluster_to_free_space(block_group, cluster);
6012 spin_unlock(&cluster->refill_lock);
6013
6014 /*
6015 * make sure this block group isn't part of a metadata
6016 * allocation cluster
6017 */
6018 cluster = &root->fs_info->meta_alloc_cluster;
6019 spin_lock(&cluster->refill_lock);
6020 btrfs_return_cluster_to_free_space(block_group, cluster);
6021 spin_unlock(&cluster->refill_lock);
6022
6527 path = btrfs_alloc_path(); 6023 path = btrfs_alloc_path();
6528 BUG_ON(!path); 6024 BUG_ON(!path);
6529 6025
@@ -6533,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6533 spin_unlock(&root->fs_info->block_group_cache_lock); 6029 spin_unlock(&root->fs_info->block_group_cache_lock);
6534 btrfs_remove_free_space_cache(block_group); 6030 btrfs_remove_free_space_cache(block_group);
6535 down_write(&block_group->space_info->groups_sem); 6031 down_write(&block_group->space_info->groups_sem);
6536 list_del(&block_group->list); 6032 /*
6033 * we must use list_del_init so people can check to see if they
6034 * are still on the list after taking the semaphore
6035 */
6036 list_del_init(&block_group->list);
6537 up_write(&block_group->space_info->groups_sem); 6037 up_write(&block_group->space_info->groups_sem);
6538 6038
6539 spin_lock(&block_group->space_info->lock); 6039 spin_lock(&block_group->space_info->lock);
@@ -6542,8 +6042,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6542 spin_unlock(&block_group->space_info->lock); 6042 spin_unlock(&block_group->space_info->lock);
6543 block_group->space_info->full = 0; 6043 block_group->space_info->full = 0;
6544 6044
6545 put_block_group(block_group); 6045 btrfs_put_block_group(block_group);
6546 put_block_group(block_group); 6046 btrfs_put_block_group(block_group);
6547 6047
6548 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 6048 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6549 if (ret > 0) 6049 if (ret > 0)
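The switch to list_del_init() matters because, per the new comment, other code may take groups_sem and then test whether a block group is still linked; only the _init variant leaves the node pointing at itself so such an emptiness check is safe. A self-contained model of the difference (a minimal doubly linked list, not <linux/list.h>):

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_del_init_sketch(struct list_head *e)
{
	e->next->prev = e->prev;	/* unlink from neighbours */
	e->prev->next = e->next;
	e->next = e;			/* self-loop: "empty" from now on */
	e->prev = e;
}

static int on_a_list(const struct list_head *e)
{
	return e->next != e;
}

int main(void)
{
	struct list_head a, b;

	a.next = &b; a.prev = &b;	/* two-node ring */
	b.next = &a; b.prev = &a;
	list_del_init_sketch(&b);
	printf("b on list: %d\n", on_a_list(&b));	/* prints 0 */
	return 0;
}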
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..fe9eb990e443 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
17#include "ctree.h" 17#include "ctree.h"
18#include "btrfs_inode.h" 18#include "btrfs_inode.h"
19 19
20/* temporary define until extent_map moves out of btrfs */
21struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
22 unsigned long extra_flags,
23 void (*ctor)(void *, struct kmem_cache *,
24 unsigned long));
25
26static struct kmem_cache *extent_state_cache; 20static struct kmem_cache *extent_state_cache;
27static struct kmem_cache *extent_buffer_cache; 21static struct kmem_cache *extent_buffer_cache;
28 22
@@ -50,20 +44,23 @@ struct extent_page_data {
50 /* tells writepage not to lock the state bits for this range 44 /* tells writepage not to lock the state bits for this range
51 * it still does the unlocking 45 * it still does the unlocking
52 */ 46 */
53 int extent_locked; 47 unsigned int extent_locked:1;
48
49 /* tells the submit_bio code to use a WRITE_SYNC */
50 unsigned int sync_io:1;
54}; 51};
55 52
56int __init extent_io_init(void) 53int __init extent_io_init(void)
57{ 54{
58 extent_state_cache = btrfs_cache_create("extent_state", 55 extent_state_cache = kmem_cache_create("extent_state",
59 sizeof(struct extent_state), 0, 56 sizeof(struct extent_state), 0,
60 NULL); 57 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
61 if (!extent_state_cache) 58 if (!extent_state_cache)
62 return -ENOMEM; 59 return -ENOMEM;
63 60
64 extent_buffer_cache = btrfs_cache_create("extent_buffers", 61 extent_buffer_cache = kmem_cache_create("extent_buffers",
65 sizeof(struct extent_buffer), 0, 62 sizeof(struct extent_buffer), 0,
66 NULL); 63 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
67 if (!extent_buffer_cache) 64 if (!extent_buffer_cache)
68 goto free_state_cache; 65 goto free_state_cache;
69 return 0; 66 return 0;
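With extent_map staying inside btrfs, the temporary btrfs_cache_create() wrapper is dropped and the slab caches are created directly: SLAB_RECLAIM_ACCOUNT tells the VM this memory is reclaimable, and SLAB_MEM_SPREAD spreads allocations across nodes for cpuset users. The call shape as a kernel-style fragment; the "demo" names are placeholders and this is not standalone userspace code:

#include <linux/slab.h>

struct demo_obj {
	int payload;
};

static struct kmem_cache *demo_cache;

static int demo_cache_init(void)
{
	/* same flags as the extent_state/extent_buffers caches above */
	demo_cache = kmem_cache_create("demo_objs", sizeof(struct demo_obj),
				       0, SLAB_RECLAIM_ACCOUNT |
				       SLAB_MEM_SPREAD, NULL);
	if (!demo_cache)
		return -ENOMEM;
	return 0;
}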
@@ -1404,69 +1401,6 @@ out:
1404 return total_bytes; 1401 return total_bytes;
1405} 1402}
1406 1403
1407#if 0
1408/*
1409 * helper function to lock both pages and extents in the tree.
1410 * pages must be locked first.
1411 */
1412static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1413{
1414 unsigned long index = start >> PAGE_CACHE_SHIFT;
1415 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1416 struct page *page;
1417 int err;
1418
1419 while (index <= end_index) {
1420 page = grab_cache_page(tree->mapping, index);
1421 if (!page) {
1422 err = -ENOMEM;
1423 goto failed;
1424 }
1425 if (IS_ERR(page)) {
1426 err = PTR_ERR(page);
1427 goto failed;
1428 }
1429 index++;
1430 }
1431 lock_extent(tree, start, end, GFP_NOFS);
1432 return 0;
1433
1434failed:
1435 /*
1436 * we failed above in getting the page at 'index', so we undo here
1437 * up to but not including the page at 'index'
1438 */
1439 end_index = index;
1440 index = start >> PAGE_CACHE_SHIFT;
1441 while (index < end_index) {
1442 page = find_get_page(tree->mapping, index);
1443 unlock_page(page);
1444 page_cache_release(page);
1445 index++;
1446 }
1447 return err;
1448}
1449
1450/*
1451 * helper function to unlock both pages and extents in the tree.
1452 */
1453static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1454{
1455 unsigned long index = start >> PAGE_CACHE_SHIFT;
1456 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1457 struct page *page;
1458
1459 while (index <= end_index) {
1460 page = find_get_page(tree->mapping, index);
1461 unlock_page(page);
1462 page_cache_release(page);
1463 index++;
1464 }
1465 unlock_extent(tree, start, end, GFP_NOFS);
1466 return 0;
1467}
1468#endif
1469
1470/* 1404/*
1471 * set the private field for a given byte offset in the tree. If there isn't 1405 * set the private field for a given byte offset in the tree. If there isn't
1472 * an extent_state there already, this does nothing. 1406 * an extent_state there already, this does nothing.
@@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2101 return ret; 2035 return ret;
2102} 2036}
2103 2037
2038static noinline void update_nr_written(struct page *page,
2039 struct writeback_control *wbc,
2040 unsigned long nr_written)
2041{
2042 wbc->nr_to_write -= nr_written;
2043 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2044 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2045 page->mapping->writeback_index = page->index + nr_written;
2046}
2047
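update_nr_written() centralizes bookkeeping that used to happen once at the bottom of __extent_writepage(): spend the writeback page budget and, for cyclic scans, remember where the next pass should resume. A simplified userspace model; the struct is illustrative, not struct writeback_control:

#include <stdio.h>

struct wb_state {
	long nr_to_write;		/* pages we may still write */
	unsigned long resume_index;	/* where the next scan starts */
	int range_cyclic;
};

static void account_written(struct wb_state *wb,
			    unsigned long page_index, unsigned long nr)
{
	wb->nr_to_write -= nr;
	if (wb->range_cyclic)
		wb->resume_index = page_index + nr;
}

int main(void)
{
	struct wb_state wb = { 64, 0, 1 };

	account_written(&wb, 10, 1);
	printf("budget=%ld resume=%lu\n", wb.nr_to_write, wb.resume_index);
	return 0;
}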
2104/* 2048/*
2105 * the writepage semantics are similar to regular writepage. extent 2049 * the writepage semantics are similar to regular writepage. extent
2106 * records are inserted to lock ranges in the tree, and as dirty areas 2050 * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 u64 delalloc_end; 2080 u64 delalloc_end;
2137 int page_started; 2081 int page_started;
2138 int compressed; 2082 int compressed;
2083 int write_flags;
2139 unsigned long nr_written = 0; 2084 unsigned long nr_written = 0;
2140 2085
2086 if (wbc->sync_mode == WB_SYNC_ALL)
2087 write_flags = WRITE_SYNC_PLUG;
2088 else
2089 write_flags = WRITE;
2090
2141 WARN_ON(!PageLocked(page)); 2091 WARN_ON(!PageLocked(page));
2142 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2092 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2143 if (page->index > end_index || 2093 if (page->index > end_index ||
@@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2164 delalloc_end = 0; 2114 delalloc_end = 0;
2165 page_started = 0; 2115 page_started = 0;
2166 if (!epd->extent_locked) { 2116 if (!epd->extent_locked) {
2117 /*
2118 * make sure the wbc mapping index is at least updated
2119 * to this page.
2120 */
2121 update_nr_written(page, wbc, 0);
2122
2167 while (delalloc_end < page_end) { 2123 while (delalloc_end < page_end) {
2168 nr_delalloc = find_lock_delalloc_range(inode, tree, 2124 nr_delalloc = find_lock_delalloc_range(inode, tree,
2169 page, 2125 page,
@@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2185 */ 2141 */
2186 if (page_started) { 2142 if (page_started) {
2187 ret = 0; 2143 ret = 0;
2188 goto update_nr_written; 2144 /*
2145 * we've unlocked the page, so we can't update
2146 * the mapping's writeback index, just update
2147 * nr_to_write.
2148 */
2149 wbc->nr_to_write -= nr_written;
2150 goto done_unlocked;
2189 } 2151 }
2190 } 2152 }
2191 lock_extent(tree, start, page_end, GFP_NOFS); 2153 lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2198 if (ret == -EAGAIN) { 2160 if (ret == -EAGAIN) {
2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2161 unlock_extent(tree, start, page_end, GFP_NOFS);
2200 redirty_page_for_writepage(wbc, page); 2162 redirty_page_for_writepage(wbc, page);
2163 update_nr_written(page, wbc, nr_written);
2201 unlock_page(page); 2164 unlock_page(page);
2202 ret = 0; 2165 ret = 0;
2203 goto update_nr_written; 2166 goto done_unlocked;
2204 } 2167 }
2205 } 2168 }
2206 2169
2207 nr_written++; 2170 /*
2171 * we don't want to touch the inode after unlocking the page,
2172 * so we update the mapping writeback index now
2173 */
2174 update_nr_written(page, wbc, nr_written + 1);
2208 2175
2209 end = page_end; 2176 end = page_end;
2210 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) 2177 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2314 (unsigned long long)end); 2281 (unsigned long long)end);
2315 } 2282 }
2316 2283
2317 ret = submit_extent_page(WRITE, tree, page, sector, 2284 ret = submit_extent_page(write_flags, tree, page,
2318 iosize, pg_offset, bdev, 2285 sector, iosize, pg_offset,
2319 &epd->bio, max_nr, 2286 bdev, &epd->bio, max_nr,
2320 end_bio_extent_writepage, 2287 end_bio_extent_writepage,
2321 0, 0, 0); 2288 0, 0, 0);
2322 if (ret) 2289 if (ret)
@@ -2336,11 +2303,8 @@ done:
2336 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2303 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2337 unlock_page(page); 2304 unlock_page(page);
2338 2305
2339update_nr_written: 2306done_unlocked:
2340 wbc->nr_to_write -= nr_written; 2307
2341 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2342 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2343 page->mapping->writeback_index = page->index + nr_written;
2344 return 0; 2308 return 0;
2345} 2309}
2346 2310
@@ -2460,15 +2424,23 @@ retry:
2460 return ret; 2424 return ret;
2461} 2425}
2462 2426
2463static noinline void flush_write_bio(void *data) 2427static void flush_epd_write_bio(struct extent_page_data *epd)
2464{ 2428{
2465 struct extent_page_data *epd = data;
2466 if (epd->bio) { 2429 if (epd->bio) {
2467 submit_one_bio(WRITE, epd->bio, 0, 0); 2430 if (epd->sync_io)
2431 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2432 else
2433 submit_one_bio(WRITE, epd->bio, 0, 0);
2468 epd->bio = NULL; 2434 epd->bio = NULL;
2469 } 2435 }
2470} 2436}
2471 2437
2438static noinline void flush_write_bio(void *data)
2439{
2440 struct extent_page_data *epd = data;
2441 flush_epd_write_bio(epd);
2442}
2443
2472int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2444int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2473 get_extent_t *get_extent, 2445 get_extent_t *get_extent,
2474 struct writeback_control *wbc) 2446 struct writeback_control *wbc)
@@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2480 .tree = tree, 2452 .tree = tree,
2481 .get_extent = get_extent, 2453 .get_extent = get_extent,
2482 .extent_locked = 0, 2454 .extent_locked = 0,
2455 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2483 }; 2456 };
2484 struct writeback_control wbc_writepages = { 2457 struct writeback_control wbc_writepages = {
2485 .bdi = wbc->bdi, 2458 .bdi = wbc->bdi,
2486 .sync_mode = WB_SYNC_NONE, 2459 .sync_mode = wbc->sync_mode,
2487 .older_than_this = NULL, 2460 .older_than_this = NULL,
2488 .nr_to_write = 64, 2461 .nr_to_write = 64,
2489 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2462 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2490 .range_end = (loff_t)-1, 2463 .range_end = (loff_t)-1,
2491 }; 2464 };
2492 2465
2493
2494 ret = __extent_writepage(page, wbc, &epd); 2466 ret = __extent_writepage(page, wbc, &epd);
2495 2467
2496 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2468 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2497 __extent_writepage, &epd, flush_write_bio); 2469 __extent_writepage, &epd, flush_write_bio);
2498 if (epd.bio) 2470 flush_epd_write_bio(&epd);
2499 submit_one_bio(WRITE, epd.bio, 0, 0);
2500 return ret; 2471 return ret;
2501} 2472}
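flush_epd_write_bio() gives every exit path one place that submits the pending bio at the right priority: callers that set sync_io (WB_SYNC_ALL writeback) get a synchronous write, everyone else a normal one. The helper's shape, modelled in userspace:

#include <stdio.h>

struct epd {
	int has_bio;
	int sync_io;	/* set when the caller used WB_SYNC_ALL */
};

static void flush_epd(struct epd *e)
{
	if (!e->has_bio)
		return;
	/* one decision point instead of per-caller submit calls */
	printf("submit %s write\n", e->sync_io ? "sync" : "async");
	e->has_bio = 0;
}

int main(void)
{
	struct epd e = { 1, 1 };

	flush_epd(&e);	/* prints "submit sync write" */
	return 0;
}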
2502 2473
@@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2515 .tree = tree, 2486 .tree = tree,
2516 .get_extent = get_extent, 2487 .get_extent = get_extent,
2517 .extent_locked = 1, 2488 .extent_locked = 1,
2489 .sync_io = mode == WB_SYNC_ALL,
2518 }; 2490 };
2519 struct writeback_control wbc_writepages = { 2491 struct writeback_control wbc_writepages = {
2520 .bdi = inode->i_mapping->backing_dev_info, 2492 .bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2540 start += PAGE_CACHE_SIZE; 2512 start += PAGE_CACHE_SIZE;
2541 } 2513 }
2542 2514
2543 if (epd.bio) 2515 flush_epd_write_bio(&epd);
2544 submit_one_bio(WRITE, epd.bio, 0, 0);
2545 return ret; 2516 return ret;
2546} 2517}
2547 2518
@@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree,
2556 .tree = tree, 2527 .tree = tree,
2557 .get_extent = get_extent, 2528 .get_extent = get_extent,
2558 .extent_locked = 0, 2529 .extent_locked = 0,
2530 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2559 }; 2531 };
2560 2532
2561 ret = extent_write_cache_pages(tree, mapping, wbc, 2533 ret = extent_write_cache_pages(tree, mapping, wbc,
2562 __extent_writepage, &epd, 2534 __extent_writepage, &epd,
2563 flush_write_bio); 2535 flush_write_bio);
2564 if (epd.bio) 2536 flush_epd_write_bio(&epd);
2565 submit_one_bio(WRITE, epd.bio, 0, 0);
2566 return ret; 2537 return ret;
2567} 2538}
2568 2539
@@ -2884,25 +2855,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2855 disko = 0;
2885 flags = 0; 2856 flags = 0;
2886 2857
2887 		switch (em->block_start) {
2888 		case EXTENT_MAP_LAST_BYTE:
2889 			end = 1;
2890 			flags |= FIEMAP_EXTENT_LAST;
2891 			break;
2892 		case EXTENT_MAP_HOLE:
2893 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 			break;
2895 		case EXTENT_MAP_INLINE:
2896 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 				  FIEMAP_EXTENT_NOT_ALIGNED);
2898 			break;
2899 		case EXTENT_MAP_DELALLOC:
2900 			flags |= (FIEMAP_EXTENT_DELALLOC |
2901 				  FIEMAP_EXTENT_UNKNOWN);
2902 			break;
2903 		default:
2904 			disko = em->block_start;
2905 			break;
2906 		}
2858 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2859 			end = 1;
2860 			flags |= FIEMAP_EXTENT_LAST;
2861 		} else if (em->block_start == EXTENT_MAP_HOLE) {
2862 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2863 		} else if (em->block_start == EXTENT_MAP_INLINE) {
2864 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2865 				  FIEMAP_EXTENT_NOT_ALIGNED);
2866 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2867 			flags |= (FIEMAP_EXTENT_DELALLOC |
2868 				  FIEMAP_EXTENT_UNKNOWN);
2869 		} else {
2870 			disko = em->block_start;
2871 		}
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2872 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2873 flags |= FIEMAP_EXTENT_ENCODED;
@@ -3124,20 +3089,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3124int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3089int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 struct extent_buffer *eb) 3090 struct extent_buffer *eb)
3126{ 3091{
3127 int set;
3128 unsigned long i; 3092 unsigned long i;
3129 unsigned long num_pages; 3093 unsigned long num_pages;
3130 struct page *page; 3094 struct page *page;
3131 3095
3132 u64 start = eb->start;
3133 u64 end = start + eb->len - 1;
3134
3135 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3136 num_pages = num_extent_pages(eb->start, eb->len); 3096 num_pages = num_extent_pages(eb->start, eb->len);
3137 3097
3138 for (i = 0; i < num_pages; i++) { 3098 for (i = 0; i < num_pages; i++) {
3139 page = extent_buffer_page(eb, i); 3099 page = extent_buffer_page(eb, i);
3140 if (!set && !PageDirty(page)) 3100 if (!PageDirty(page))
3141 continue; 3101 continue;
3142 3102
3143 lock_page(page); 3103 lock_page(page);
@@ -3146,22 +3106,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3146 else 3106 else
3147 set_page_private(page, EXTENT_PAGE_PRIVATE); 3107 set_page_private(page, EXTENT_PAGE_PRIVATE);
3148 3108
3149 /*
3150 * if we're on the last page or the first page and the
3151 * block isn't aligned on a page boundary, do extra checks
3152 * to make sure we don't clean page that is partially dirty
3153 */
3154 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3155 ((i == num_pages - 1) &&
3156 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3157 start = (u64)page->index << PAGE_CACHE_SHIFT;
3158 end = start + PAGE_CACHE_SIZE - 1;
3159 if (test_range_bit(tree, start, end,
3160 EXTENT_DIRTY, 0)) {
3161 unlock_page(page);
3162 continue;
3163 }
3164 }
3165 clear_page_dirty_for_io(page); 3109 clear_page_dirty_for_io(page);
3166 spin_lock_irq(&page->mapping->tree_lock); 3110 spin_lock_irq(&page->mapping->tree_lock);
3167 if (!PageDirty(page)) { 3111 if (!PageDirty(page)) {
@@ -3187,29 +3131,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3187{ 3131{
3188 unsigned long i; 3132 unsigned long i;
3189 unsigned long num_pages; 3133 unsigned long num_pages;
3134 int was_dirty = 0;
3190 3135
3136 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3191 num_pages = num_extent_pages(eb->start, eb->len); 3137 num_pages = num_extent_pages(eb->start, eb->len);
3192 for (i = 0; i < num_pages; i++) { 3138 for (i = 0; i < num_pages; i++)
3193 struct page *page = extent_buffer_page(eb, i);
3194 /* writepage may need to do something special for the
3195 * first page, we have to make sure page->private is
3196 * properly set. releasepage may drop page->private
3197 * on us if the page isn't already dirty.
3198 */
3199 lock_page(page);
3200 if (i == 0) {
3201 set_page_extent_head(page, eb->len);
3202 } else if (PagePrivate(page) &&
3203 page->private != EXTENT_PAGE_PRIVATE) {
3204 set_page_extent_mapped(page);
3205 }
3206 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3139 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3207 set_extent_dirty(tree, page_offset(page), 3140 return was_dirty;
3208 page_offset(page) + PAGE_CACHE_SIZE - 1,
3209 GFP_NOFS);
3210 unlock_page(page);
3211 }
3212 return 0;
3213} 3141}
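set_extent_buffer_dirty() now costs one atomic bit operation on the buffer instead of per-page locking plus extent-tree dirty state. The key idiom is test_and_set_bit(), which sets the flag and atomically reports whether it was already set; a C11 equivalent:

#include <stdatomic.h>
#include <stdio.h>

#define BUF_DIRTY (1u << 2)	/* mirrors the EXTENT_BUFFER_DIRTY bit */

static int mark_dirty(atomic_uint *flags)
{
	/* nonzero means the buffer was already dirty */
	return atomic_fetch_or(flags, BUF_DIRTY) & BUF_DIRTY;
}

int main(void)
{
	atomic_uint flags = 0;

	printf("was dirty: %d\n", !!mark_dirty(&flags));	/* 0 */
	printf("was dirty: %d\n", !!mark_dirty(&flags));	/* 1 */
	return 0;
}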
3214 3142
3215int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3143int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3717,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3789 ret = 0; 3717 ret = 0;
3790 goto out; 3718 goto out;
3791 } 3719 }
3720 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3721 ret = 0;
3722 goto out;
3723 }
3792 /* at this point we can safely release the extent buffer */ 3724 /* at this point we can safely release the extent buffer */
3793 num_pages = num_extent_pages(eb->start, eb->len); 3725 num_pages = num_extent_pages(eb->start, eb->len);
3794 for (i = 0; i < num_pages; i++) 3726 for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..30c9365861e6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
6#include <linux/hardirq.h> 6#include <linux/hardirq.h>
7#include "extent_map.h" 7#include "extent_map.h"
8 8
9/* temporary define until extent_map moves out of btrfs */
10struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
11 unsigned long extra_flags,
12 void (*ctor)(void *, struct kmem_cache *,
13 unsigned long));
14 9
15static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
16 11
17int __init extent_map_init(void) 12int __init extent_map_init(void)
18{ 13{
19 extent_map_cache = btrfs_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("extent_map",
20 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
21 NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
22 if (!extent_map_cache) 17 if (!extent_map_cache)
23 return -ENOMEM; 18 return -ENOMEM;
24 return 0; 19 return 0;
@@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43 tree->map.rb_node = NULL; 38 tree->map.rb_node = NULL;
44 spin_lock_init(&tree->lock); 39 spin_lock_init(&tree->lock);
45} 40}
46EXPORT_SYMBOL(extent_map_tree_init);
47 41
48/** 42/**
49 * alloc_extent_map - allocate new extent map structure 43 * alloc_extent_map - allocate new extent map structure
@@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
64 atomic_set(&em->refs, 1); 58 atomic_set(&em->refs, 1);
65 return em; 59 return em;
66} 60}
67EXPORT_SYMBOL(alloc_extent_map);
68 61
69/** 62/**
70 * free_extent_map - drop reference count of an extent_map 63 * free_extent_map - drop reference count of an extent_map
@@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em)
83 kmem_cache_free(extent_map_cache, em); 76 kmem_cache_free(extent_map_cache, em);
84 } 77 }
85} 78}
86EXPORT_SYMBOL(free_extent_map);
87 79
88static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 80static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
89 struct rb_node *node) 81 struct rb_node *node)
@@ -234,7 +226,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 226 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 227 if (rb) {
236 ret = -EEXIST; 228 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 229 goto out;
239 } 230 }
240 atomic_inc(&em->refs); 231 atomic_inc(&em->refs);
@@ -265,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
265out: 256out:
266 return ret; 257 return ret;
267} 258}
268EXPORT_SYMBOL(add_extent_mapping);
269 259
270/* simple helper to do math around the end of an extent, handling wrap */ 260/* simple helper to do math around the end of an extent, handling wrap */
271static u64 range_end(u64 start, u64 len) 261static u64 range_end(u64 start, u64 len)
@@ -327,7 +317,6 @@ found:
327out: 317out:
328 return em; 318 return em;
329} 319}
330EXPORT_SYMBOL(lookup_extent_mapping);
331 320
332/** 321/**
333 * remove_extent_mapping - removes an extent_map from the extent tree 322 * remove_extent_mapping - removes an extent_map from the extent tree
@@ -347,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
347 em->in_tree = 0; 336 em->in_tree = 0;
348 return ret; 337 return ret;
349} 338}
350EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
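The path->leave_spinning assignments sprinkled through this file mark short, non-sleeping item insertions: the tree search may keep its cheap spinning locks rather than converting them to blocking locks the caller never needs. An illustrative model of the flag's effect, not the real btrfs locking code:

#include <stdio.h>

struct path {
	int leave_spinning;
	int blocking;
};

static void finish_search(struct path *p)
{
	/* by default, hand back locks the caller may sleep under */
	if (!p->leave_spinning)
		p->blocking = 1;
}

int main(void)
{
	struct path p = { 1, 0 };

	finish_search(&p);
	printf("blocking=%d\n", p.blocking);	/* stays spinning: 0 */
	return 0;
}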
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..1d51dc38bb49 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
272 return 0; 272 return 0;
273} 273}
274 274
275int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
276{
277 return 0;
278#if 0
279 struct btrfs_path *path;
280 struct btrfs_key found_key;
281 struct extent_buffer *leaf;
282 struct btrfs_file_extent_item *extent;
283 u64 last_offset = 0;
284 int nritems;
285 int slot;
286 int found_type;
287 int ret;
288 int err = 0;
289 u64 extent_end = 0;
290
291 path = btrfs_alloc_path();
292 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
293 last_offset, 0);
294 while (1) {
295 nritems = btrfs_header_nritems(path->nodes[0]);
296 if (path->slots[0] >= nritems) {
297 ret = btrfs_next_leaf(root, path);
298 if (ret)
299 goto out;
300 nritems = btrfs_header_nritems(path->nodes[0]);
301 }
302 slot = path->slots[0];
303 leaf = path->nodes[0];
304 btrfs_item_key_to_cpu(leaf, &found_key, slot);
305 if (found_key.objectid != inode->i_ino)
306 break;
307 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
308 goto out;
309
310 if (found_key.offset < last_offset) {
311 WARN_ON(1);
312 btrfs_print_leaf(root, leaf);
313 printk(KERN_ERR "inode %lu found offset %llu "
314 "expected %llu\n", inode->i_ino,
315 (unsigned long long)found_key.offset,
316 (unsigned long long)last_offset);
317 err = 1;
318 goto out;
319 }
320 extent = btrfs_item_ptr(leaf, slot,
321 struct btrfs_file_extent_item);
322 found_type = btrfs_file_extent_type(leaf, extent);
323 if (found_type == BTRFS_FILE_EXTENT_REG) {
324 extent_end = found_key.offset +
325 btrfs_file_extent_num_bytes(leaf, extent);
326 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
327 struct btrfs_item *item;
328 item = btrfs_item_nr(leaf, slot);
329 extent_end = found_key.offset +
330 btrfs_file_extent_inline_len(leaf, extent);
331 extent_end = (extent_end + root->sectorsize - 1) &
332 ~((u64)root->sectorsize - 1);
333 }
334 last_offset = extent_end;
335 path->slots[0]++;
336 }
337 if (0 && last_offset < inode->i_size) {
338 WARN_ON(1);
339 btrfs_print_leaf(root, leaf);
340 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
341 inode->i_ino, (unsigned long long)last_offset,
342 (unsigned long long)inode->i_size);
343 err = 1;
344
345 }
346out:
347 btrfs_free_path(path);
348 return err;
349#endif
350}
351
352/* 275/*
353 * this is very complex, but the basic idea is to drop all extents 276 * this is very complex, but the basic idea is to drop all extents
354 * in the range start - end. hint_block is filled in with a block number 277 * in the range start - end. hint_block is filled in with a block number
@@ -363,15 +286,16 @@ out:
363 */ 286 */
364noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 287noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
365 struct btrfs_root *root, struct inode *inode, 288 struct btrfs_root *root, struct inode *inode,
366 u64 start, u64 end, u64 inline_limit, u64 *hint_byte) 289 u64 start, u64 end, u64 locked_end,
290 u64 inline_limit, u64 *hint_byte)
367{ 291{
368 u64 extent_end = 0; 292 u64 extent_end = 0;
369 u64 locked_end = end;
370 u64 search_start = start; 293 u64 search_start = start;
371 u64 leaf_start; 294 u64 leaf_start;
372 u64 ram_bytes = 0; 295 u64 ram_bytes = 0;
373 u64 orig_parent = 0; 296 u64 orig_parent = 0;
374 u64 disk_bytenr = 0; 297 u64 disk_bytenr = 0;
298 u64 orig_locked_end = locked_end;
375 u8 compression; 299 u8 compression;
376 u8 encryption; 300 u8 encryption;
377 u16 other_encoding = 0; 301 u16 other_encoding = 0;
@@ -606,6 +530,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 530 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 531
608 btrfs_release_path(root, path); 532 btrfs_release_path(root, path);
533 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 534 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 535 sizeof(*extent));
611 BUG_ON(ret); 536 BUG_ON(ret);
@@ -639,17 +564,22 @@ next_slot:
639 ram_bytes); 564 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 565 btrfs_set_file_extent_type(leaf, extent, found_type);
641 566
567 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 568 btrfs_mark_buffer_dirty(path->nodes[0]);
569 btrfs_set_lock_blocking(path->nodes[0]);
643 570
644 if (disk_bytenr != 0) { 571 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 572 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 573 disk_bytenr,
574 le64_to_cpu(old.disk_num_bytes),
575 orig_parent,
647 leaf->start, 576 leaf->start,
648 root->root_key.objectid, 577 root->root_key.objectid,
649 trans->transid, ins.objectid); 578 trans->transid, ins.objectid);
650 579
651 BUG_ON(ret); 580 BUG_ON(ret);
652 } 581 }
582 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 583 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 584 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 585 inode_add_bytes(inode, extent_end - end);
@@ -678,11 +608,10 @@ next_slot:
678 } 608 }
679out: 609out:
680 btrfs_free_path(path); 610 btrfs_free_path(path);
681 if (locked_end > end) { 611 if (locked_end > orig_locked_end) {
682 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, 612 unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
683 GFP_NOFS); 613 locked_end - 1, GFP_NOFS);
684 } 614 }
685 btrfs_check_file(root, inode);
686 return ret; 615 return ret;
687} 616}
688 617
@@ -824,7 +753,7 @@ again:
824 753
825 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 754 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
826 BUG_ON(ret); 755 BUG_ON(ret);
827 goto done; 756 goto release;
828 } else if (split == start) { 757 } else if (split == start) {
829 if (locked_end < extent_end) { 758 if (locked_end < extent_end) {
830 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 759 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -912,7 +841,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 841 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 842
914 if (orig_parent != leaf->start) { 843 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 844 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 845 orig_parent, leaf->start,
917 root->root_key.objectid, 846 root->root_key.objectid,
918 trans->transid, inode->i_ino); 847 trans->transid, inode->i_ino);
@@ -920,6 +849,8 @@ again:
920 } 849 }
921done: 850done:
922 btrfs_mark_buffer_dirty(leaf); 851 btrfs_mark_buffer_dirty(leaf);
852
853release:
923 btrfs_release_path(root, path); 854 btrfs_release_path(root, path);
924 if (split_end && split == start) { 855 if (split_end && split == start) {
925 split = end; 856 split = end;
@@ -1125,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1125 if (will_write) { 1056 if (will_write) {
1126 btrfs_fdatawrite_range(inode->i_mapping, pos, 1057 btrfs_fdatawrite_range(inode->i_mapping, pos,
1127 pos + write_bytes - 1, 1058 pos + write_bytes - 1,
1128 WB_SYNC_NONE); 1059 WB_SYNC_ALL);
1129 } else { 1060 } else {
1130 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1061 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1131 num_pages); 1062 num_pages);
@@ -1155,6 +1086,20 @@ out_nolock:
1155 page_cache_release(pinned[1]); 1086 page_cache_release(pinned[1]);
1156 *ppos = pos; 1087 *ppos = pos;
1157 1088
1089 /*
1090 * we want to make sure fsync finds this change
1091 * but we haven't joined a transaction running right now.
1092 *
1093 * Later on, someone is sure to update the inode and get the
1094 * real transid recorded.
1095 *
1096 * We set last_trans now to the fs_info generation + 1,
1097 * this will either be one more than the running transaction
1098 * or the generation used for the next transaction if there isn't
1099 * one running right now.
1100 */
1101 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1102
1158 if (num_written > 0 && will_write) { 1103 if (num_written > 0 && will_write) {
1159 struct btrfs_trans_handle *trans; 1104 struct btrfs_trans_handle *trans;
1160 1105
@@ -1167,8 +1112,11 @@ out_nolock:
1167 ret = btrfs_log_dentry_safe(trans, root, 1112 ret = btrfs_log_dentry_safe(trans, root,
1168 file->f_dentry); 1113 file->f_dentry);
1169 if (ret == 0) { 1114 if (ret == 0) {
1170 btrfs_sync_log(trans, root); 1115 ret = btrfs_sync_log(trans, root);
1171 btrfs_end_transaction(trans, root); 1116 if (ret == 0)
1117 btrfs_end_transaction(trans, root);
1118 else
1119 btrfs_commit_transaction(trans, root);
1172 } else { 1120 } else {
1173 btrfs_commit_transaction(trans, root); 1121 btrfs_commit_transaction(trans, root);
1174 } 1122 }
@@ -1185,6 +1133,18 @@ out_nolock:
1185 1133
1186int btrfs_release_file(struct inode *inode, struct file *filp) 1134int btrfs_release_file(struct inode *inode, struct file *filp)
1187{ 1135{
1136 /*
1137 * ordered_data_close is set by setattr when we are about to truncate
1138 * a file from a non-zero size to a zero size. This tries to
1139 * flush down new bytes that may have been written if the
1140 * application were using truncate to replace a file in place.
1141 */
1142 if (BTRFS_I(inode)->ordered_data_close) {
1143 BTRFS_I(inode)->ordered_data_close = 0;
1144 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1145 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1146 filemap_flush(inode->i_mapping);
1147 }
1188 if (filp->private_data) 1148 if (filp->private_data)
1189 btrfs_ioctl_trans_end(filp); 1149 btrfs_ioctl_trans_end(filp);
1190 return 0; 1150 return 0;
@@ -1260,8 +1220,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1260 if (ret > 0) { 1220 if (ret > 0) {
1261 ret = btrfs_commit_transaction(trans, root); 1221 ret = btrfs_commit_transaction(trans, root);
1262 } else { 1222 } else {
1263 btrfs_sync_log(trans, root); 1223 ret = btrfs_sync_log(trans, root);
1264 ret = btrfs_end_transaction(trans, root); 1224 if (ret == 0)
1225 ret = btrfs_end_transaction(trans, root);
1226 else
1227 ret = btrfs_commit_transaction(trans, root);
1265 } 1228 }
1266 mutex_lock(&dentry->d_inode->i_mutex); 1229 mutex_lock(&dentry->d_inode->i_mutex);
1267out: 1230out:
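Both fsync paths now honor btrfs_sync_log()'s return value: if the cheap tree-log write fails, they fall back to a full transaction commit rather than ending the transaction and silently weakening fsync()'s durability promise. The control flow in miniature, with hypothetical helpers:

#include <stdio.h>

struct txn { int log_ok; };

static int sync_log(struct txn *t)   { return t->log_ok ? 0 : -1; }
static int end_txn(struct txn *t)    { (void)t; return 0; }
static int commit_txn(struct txn *t) { (void)t; return 0; }

static int fsync_like(struct txn *t)
{
	if (sync_log(t) == 0)
		return end_txn(t);	/* fast path: log replay suffices */
	return commit_txn(t);		/* slow but safe fallback */
}

int main(void)
{
	struct txn t = { 0 };

	printf("ret=%d\n", fsync_like(&t));	/* falls back to commit */
	return 0;
}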
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..0bc93657b460 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
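
The new struct btrfs_free_space carries two rb_node members because every free extent is indexed twice: by offset, for merging neighbors and honoring placement hints, and by bytes, for size-based lookups. A userspace sketch of the two orderings this layout implies (plain comparators here; the kernel hangs <linux/rbtree.h> nodes embedded in the entry off each tree):

    #include <stdint.h>
    #include <stdio.h>

    /* one free extent; the kernel version also embeds the two rb_nodes */
    struct free_space { uint64_t offset; uint64_t bytes; };

    /* ordering for the offset tree: where the space is */
    static int cmp_offset(const struct free_space *a, const struct free_space *b)
    {
            return (a->offset > b->offset) - (a->offset < b->offset);
    }

    /* ordering for the bytes tree: how big the space is */
    static int cmp_bytes(const struct free_space *a, const struct free_space *b)
    {
            return (a->bytes > b->bytes) - (a->bytes < b->bytes);
    }

    int main(void)
    {
            struct free_space a = { .offset = 0,    .bytes = 8192 };
            struct free_space b = { .offset = 4096, .bytes = 4096 };

            /* a sorts before b by offset but after b by bytes */
            printf("%d %d\n", cmp_offset(&a, &b), cmp_bytes(&a, &b));
            return 0;
    }

Keeping the two indexes consistent is the point of the link_free_space()/unlink_free_space() helpers below: an entry is always added to, or removed from, both trees together.
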
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
 84 * mark, or the hint may no longer point to free space, we need to fudge our
 85 * results a bit. So we look for free space starting at or after offset, at
 86 * least bytes in size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
 92 * starts at the given offset and is at least bytes in size; if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
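
The parameter rename from contains to fuzzy changes what counts as a hit, per the rewritten comment above. A userspace sketch of both modes over a sorted array instead of an rbtree (a linear scan for brevity; the real code tracks its best candidate during the tree descent, but the acceptance rules are the same):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* e[] sorted by offset, non-overlapping. fuzzy == 0 demands an entry
     * starting exactly at offset; fuzzy == 1 also accepts the entry that
     * contains offset, or failing that the first suitable one after it. */
    static struct free_space *search_offset(struct free_space *e, size_t n,
                                            uint64_t offset, uint64_t bytes,
                                            int fuzzy)
    {
            struct free_space *after = NULL;
            size_t i;

            for (i = 0; i < n; i++) {
                    if (e[i].bytes < bytes)
                            continue;               /* too small in either mode */
                    if (e[i].offset == offset)
                            return &e[i];           /* exact start: both modes */
                    if (!fuzzy)
                            continue;
                    if (e[i].offset < offset &&
                        e[i].offset + e[i].bytes - 1 >= offset)
                            return &e[i];           /* contains the hint */
                    if (e[i].offset > offset && !after)
                            after = &e[i];          /* closest entry after the hint */
            }
            return after;
    }

    int main(void)
    {
            struct free_space e[] = { { 0, 4096 }, { 8192, 4096 } };

            assert(search_offset(e, 2, 100, 512, 1) == &e[0]); /* containment */
            assert(search_offset(e, 2, 100, 512, 0) == NULL);  /* exact only */
            return 0;
    }
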
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
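
The rewritten btrfs_add_free_space always builds a fresh entry, then absorbs whichever neighbors abut it: a right neighbor found by an exact search at offset + bytes, and a left neighbor found by a fuzzy search at offset - 1 and merged only when it ends exactly at offset. A userspace sketch of the merge arithmetic alone (tree linking and locking omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* merge the new range [offset, offset + bytes) with its neighbors.
     * right, if non-NULL, starts exactly at offset + bytes; left is
     * merged only when it ends exactly at offset. Both are assumed to
     * be unlinked from the trees already, as in the code above. */
    static struct free_space *merge_range(uint64_t offset, uint64_t bytes,
                                          struct free_space *left,
                                          struct free_space *right)
    {
            struct free_space *info = calloc(1, sizeof(*info));

            if (!info)
                    return NULL;
            info->offset = offset;
            info->bytes = bytes;

            if (right) {                            /* absorb the higher neighbor */
                    info->bytes += right->bytes;
                    free(right);
            }
            if (left && left->offset + left->bytes == offset) {
                    info->offset = left->offset;    /* absorb the lower neighbor */
                    info->bytes += left->bytes;
                    free(left);
            }
            return info;
    }

    int main(void)
    {
            struct free_space *l = calloc(1, sizeof(*l));
            struct free_space *r = calloc(1, sizeof(*r));
            struct free_space *m;

            l->offset = 0;    l->bytes = 4096;      /* ends at 4096 */
            r->offset = 8192; r->bytes = 4096;      /* starts at 8192 */
            m = merge_range(4096, 4096, l, r);      /* plugs the gap exactly */
            assert(m && m->offset == 0 && m->bytes == 12288);
            free(m);
            return 0;
    }
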
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,37 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
344 WARN_ON(1); 347 WARN_ON(1);
345 } 348 }
346out: 349out:
347 return ret; 350 return ret;
348} 351}
349 352
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 353void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 354 u64 bytes)
402{ 355{
@@ -408,6 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 361 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 362 if (info->bytes >= bytes)
410 count++; 363 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n",
365 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes);
411 } 367 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 369 "\n", count);
@@ -428,68 +384,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 384 return ret;
429} 385}
430 386
387/*
388 * for a given cluster, put all of its extents back into the free
389 * space cache. If the block group passed doesn't match the block group
390 * pointed to by the cluster, someone else raced in and freed the
391 * cluster already. In that case, we just return without changing anything
392 */
393static int
394__btrfs_return_cluster_to_free_space(
395 struct btrfs_block_group_cache *block_group,
396 struct btrfs_free_cluster *cluster)
397{
398 struct btrfs_free_space *entry;
399 struct rb_node *node;
400
401 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group)
403 goto out;
404
405 cluster->window_start = 0;
406 node = rb_first(&cluster->root);
 407 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry);
412 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL;
418out:
419 spin_unlock(&cluster->lock);
420 return 0;
421}
422
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 423void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 424{
433 struct btrfs_free_space *info; 425 struct btrfs_free_space *info;
434 struct rb_node *node; 426 struct rb_node *node;
427 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe;
429
430 spin_lock(&block_group->tree_lock);
431
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
433 block_group_list) {
434
435 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster);
437 }
435 438
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 440 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 441 unlink_free_space(block_group, info);
440 kfree(info); 442 kfree(info);
441 if (need_resched()) { 443 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 445 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 446 spin_lock(&block_group->tree_lock);
445 } 447 }
446 } 448 }
447 mutex_unlock(&block_group->alloc_mutex); 449 spin_unlock(&block_group->tree_lock);
448} 450}
449 451
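
The drain loop above also shows the cost of this file's mutex-to-spinlock conversion: cond_resched() may sleep, so the spinlock must be dropped around it and re-taken, and the loop then re-reads rb_last() rather than trusting a pointer from before the gap. A userspace sketch of the same drain shape (pthread_mutex_t and sched_yield() stand in for the spinlock and cond_resched()):

    #include <pthread.h>
    #include <sched.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    /* free every node, periodically dropping the lock so others can run.
     * *head is re-read after each re-lock, never cached across the gap. */
    static void drain(pthread_mutex_t *lock, struct node **head)
    {
            int freed = 0;

            pthread_mutex_lock(lock);
            while (*head) {
                    struct node *n = *head;

                    *head = n->next;
                    free(n);
                    if (++freed % 64 == 0) {        /* arbitrary batch size */
                            pthread_mutex_unlock(lock);
                            sched_yield();          /* let waiters in */
                            pthread_mutex_lock(lock);
                    }
            }
            pthread_mutex_unlock(lock);
    }

    int main(void)
    {
            pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
            struct node *head = calloc(1, sizeof(*head));

            drain(&lock, &head);
            return 0;
    }
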
450#if 0 452u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 453 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 454{
456 struct btrfs_free_space *ret; 455 struct btrfs_free_space *entry = NULL;
456 u64 ret = 0;
457 457
458 mutex_lock(&block_group->alloc_mutex); 458 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 459 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 460 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 461 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes,
463 offset, bytes + empty_size);
464 if (entry) {
465 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes;
468 entry->bytes -= bytes;
469
470 if (!entry->bytes)
471 kfree(entry);
472 else
473 link_free_space(block_group, entry);
474 }
475 spin_unlock(&block_group->tree_lock);
462 476
463 return ret; 477 return ret;
464} 478}
465 479
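
btrfs_find_space_for_alloc carves the request off the front of whichever entry it picks rather than deleting and reinserting anything: advance the offset, shrink the byte count, and free the entry only when nothing is left. A sketch of the carve by itself (the unlink before and relink after, shown above, are omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* take 'bytes' from the front of entry; returns the allocated offset.
     * Caller guarantees entry->bytes >= bytes and that the entry is
     * currently unlinked from both index trees. */
    static uint64_t carve_front(struct free_space *entry, uint64_t bytes)
    {
            uint64_t ret = entry->offset;

            entry->offset += bytes;
            entry->bytes -= bytes;
            if (entry->bytes == 0)
                    free(entry);            /* fully consumed */
            return ret;
    }

    int main(void)
    {
            struct free_space *e = malloc(sizeof(*e));

            e->offset = 4096;
            e->bytes = 8192;
            assert(carve_front(e, 4096) == 4096);
            assert(e->offset == 8192 && e->bytes == 4096);
            free(e);
            return 0;
    }
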
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 480/*
467 btrfs_block_group_cache 481 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 482 * cache. If a block group is passed, this function will only free
469 u64 bytes) 483 * a cluster that belongs to the passed block group.
484 *
485 * Otherwise, it'll get a reference on the block group pointed to by the
486 * cluster and remove the cluster from it.
487 */
488int btrfs_return_cluster_to_free_space(
489 struct btrfs_block_group_cache *block_group,
490 struct btrfs_free_cluster *cluster)
470{ 491{
471 struct btrfs_free_space *ret; 492 int ret;
472 493
473 mutex_lock(&block_group->alloc_mutex); 494 /* first, get a safe pointer to the block group */
495 spin_lock(&cluster->lock);
496 if (!block_group) {
497 block_group = cluster->block_group;
498 if (!block_group) {
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 } else if (cluster->block_group != block_group) {
 503 /* someone else has already freed it, don't redo their work */
504 spin_unlock(&cluster->lock);
505 return 0;
506 }
507 atomic_inc(&block_group->count);
508 spin_unlock(&cluster->lock);
474 509
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 510 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 511 spin_lock(&block_group->tree_lock);
512 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
513 spin_unlock(&block_group->tree_lock);
477 514
515 /* finally drop our ref */
516 btrfs_put_block_group(block_group);
478 return ret; 517 return ret;
479} 518}
480#endif
481 519
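
btrfs_return_cluster_to_free_space above is a textbook pinning pattern: hold the cluster lock just long enough to resolve the block-group pointer and bump its refcount, drop the lock, then do the real work against the pinned object and put the reference. A userspace sketch with a pthread mutex and a C11 atomic refcount (struct names are stand-ins):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct group   { atomic_int refs; };
    struct cluster { pthread_mutex_t lock; struct group *group; };

    /* resolve cluster->group to a pinned pointer, or NULL if it's gone.
     * The lock covers only the pointer read plus the refcount bump; the
     * caller then works on the group without holding the cluster lock. */
    static struct group *pin_group(struct cluster *c)
    {
            struct group *g;

            pthread_mutex_lock(&c->lock);
            g = c->group;
            if (g)
                    atomic_fetch_add(&g->refs, 1);  /* pointer is stable here */
            pthread_mutex_unlock(&c->lock);
            return g;
    }

    int main(void)
    {
            struct group g = { .refs = 1 };
            struct cluster c = { PTHREAD_MUTEX_INITIALIZER, &g };

            return pin_group(&c) == &g ? 0 : 1;
    }
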
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 520/*
483 *block_group, u64 offset, 521 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 522 * if it couldn't find anything suitably large, or a logical disk offset
523 * if things worked out
524 */
525u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
526 struct btrfs_free_cluster *cluster, u64 bytes,
527 u64 min_start)
485{ 528{
486 struct btrfs_free_space *ret = NULL; 529 struct btrfs_free_space *entry = NULL;
530 struct rb_node *node;
531 u64 ret = 0;
487 532
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 533 spin_lock(&cluster->lock);
489 bytes, 0); 534 if (bytes > cluster->max_size)
490 if (!ret) 535 goto out;
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493 536
537 if (cluster->block_group != block_group)
538 goto out;
539
540 node = rb_first(&cluster->root);
541 if (!node)
542 goto out;
543
544 entry = rb_entry(node, struct btrfs_free_space, offset_index);
545
 546 while (1) {
547 if (entry->bytes < bytes || entry->offset < min_start) {
548 struct rb_node *node;
549
550 node = rb_next(&entry->offset_index);
551 if (!node)
552 break;
553 entry = rb_entry(node, struct btrfs_free_space,
554 offset_index);
555 continue;
556 }
557 ret = entry->offset;
558
559 entry->offset += bytes;
560 entry->bytes -= bytes;
561
562 if (entry->bytes == 0) {
563 rb_erase(&entry->offset_index, &cluster->root);
564 kfree(entry);
565 }
566 break;
567 }
568out:
569 spin_unlock(&cluster->lock);
494 return ret; 570 return ret;
495} 571}
572
573/*
574 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free.
576 * We might not find them all in one contiguous area.
577 *
578 * returns zero and sets up cluster if things worked out, otherwise
 579 * it returns -ENOSPC
580 */
581int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
582 struct btrfs_block_group_cache *block_group,
583 struct btrfs_free_cluster *cluster,
584 u64 offset, u64 bytes, u64 empty_size)
585{
586 struct btrfs_free_space *entry = NULL;
587 struct rb_node *node;
588 struct btrfs_free_space *next;
589 struct btrfs_free_space *last;
590 u64 min_bytes;
591 u64 window_start;
592 u64 window_free;
593 u64 max_extent = 0;
594 int total_retries = 0;
595 int ret;
596
 597 /* for metadata, allow allocations with more holes */
598 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
599 /*
600 * we want to do larger allocations when we are
601 * flushing out the delayed refs, it helps prevent
602 * making more work as we go along.
603 */
604 if (trans->transaction->delayed_refs.flushing)
605 min_bytes = max(bytes, (bytes + empty_size) >> 1);
606 else
607 min_bytes = max(bytes, (bytes + empty_size) >> 4);
608 } else
609 min_bytes = max(bytes, (bytes + empty_size) >> 2);
610
611 spin_lock(&block_group->tree_lock);
612 spin_lock(&cluster->lock);
613
614 /* someone already found a cluster, hooray */
615 if (cluster->block_group) {
616 ret = 0;
617 goto out;
618 }
619again:
620 min_bytes = min(min_bytes, bytes + empty_size);
621 entry = tree_search_bytes(&block_group->free_space_bytes,
622 offset, min_bytes);
623 if (!entry) {
624 ret = -ENOSPC;
625 goto out;
626 }
627 window_start = entry->offset;
628 window_free = entry->bytes;
629 last = entry;
630 max_extent = entry->bytes;
631
 632 while (1) {
 633 /* our window is just right, let's fill it */
634 if (window_free >= bytes + empty_size)
635 break;
636
637 node = rb_next(&last->offset_index);
638 if (!node) {
639 ret = -ENOSPC;
640 goto out;
641 }
642 next = rb_entry(node, struct btrfs_free_space, offset_index);
643
644 /*
645 * we haven't filled the empty size and the window is
646 * very large. reset and try again
647 */
648 if (next->offset - window_start > (bytes + empty_size) * 2) {
649 entry = next;
650 window_start = entry->offset;
651 window_free = entry->bytes;
652 last = entry;
653 max_extent = 0;
654 total_retries++;
655 if (total_retries % 256 == 0) {
656 if (min_bytes >= (bytes + empty_size)) {
657 ret = -ENOSPC;
658 goto out;
659 }
660 /*
661 * grow our allocation a bit, we're not having
662 * much luck
663 */
664 min_bytes *= 2;
665 goto again;
666 }
667 } else {
668 last = next;
669 window_free += next->bytes;
670 if (entry->bytes > max_extent)
671 max_extent = entry->bytes;
672 }
673 }
674
675 cluster->window_start = entry->offset;
676
677 /*
678 * now we've found our entries, pull them out of the free space
679 * cache and put them into the cluster rbtree
680 *
681 * The cluster includes an rbtree, but only uses the offset index
682 * of each free space cache entry.
683 */
 684 while (1) {
685 node = rb_next(&entry->offset_index);
686 unlink_free_space(block_group, entry);
687 ret = tree_insert_offset(&cluster->root, entry->offset,
688 &entry->offset_index);
689 BUG_ON(ret);
690
691 if (!node || entry == last)
692 break;
693
694 entry = rb_entry(node, struct btrfs_free_space, offset_index);
695 }
696 ret = 0;
697 cluster->max_size = max_extent;
698 atomic_inc(&block_group->count);
699 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
700 cluster->block_group = block_group;
701out:
702 spin_unlock(&cluster->lock);
703 spin_unlock(&block_group->tree_lock);
704
705 return ret;
706}
707
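
btrfs_find_space_cluster is a sliding-window search over free extents in offset order: keep widening the window until the accumulated free bytes reach bytes + empty_size, restart the window whenever the next extent sits too far from the window start, and (in the code above) double min_bytes and retry after enough restarts. A userspace sketch of one pass of the window walk, with the same (bytes + empty_size) * 2 gap rule and no retry loop:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct free_space { uint64_t offset; uint64_t bytes; };

    /* e[] sorted by offset. Returns the index of the window's first
     * extent and stores its last in *last, or returns -1 for -ENOSPC. */
    static long find_window(const struct free_space *e, size_t n,
                            uint64_t want, size_t *last)
    {
            uint64_t window_free;
            size_t start = 0, i;

            if (n == 0)
                    return -1;
            window_free = e[0].bytes;
            for (i = 1; window_free < want; i++) {
                    if (i == n)
                            return -1;              /* ran out of extents */
                    if (e[i].offset - e[start].offset > want * 2) {
                            start = i;              /* window too sparse: restart */
                            window_free = e[i].bytes;
                    } else {
                            window_free += e[i].bytes;
                    }
            }
            *last = i - 1;
            return (long)start;
    }

    int main(void)
    {
            struct free_space e[] = { { 0, 1024 }, { 2048, 1024 }, { 3072, 2048 } };
            size_t last;

            assert(find_window(e, 3, 4096, &last) == 0 && last == 2);
            return 0;
    }
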
708/*
709 * simple code to zero out a cluster
710 */
711void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
712{
713 spin_lock_init(&cluster->lock);
714 spin_lock_init(&cluster->refill_lock);
715 cluster->root.rb_node = NULL;
716 cluster->max_size = 0;
717 INIT_LIST_HEAD(&cluster->block_group_list);
718 cluster->block_group = NULL;
719}
720
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d833c9..9abbced1123d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
79 } 79 }
80 path = btrfs_alloc_path(); 80 path = btrfs_alloc_path();
81 BUG_ON(!path); 81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); 82 search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start; 83 search_key.objectid = search_start;
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
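
The one-character fix above exists because the kernel's max() macro is type-strict: search_start is u64 while BTRFS_FIRST_FREE_OBJECTID expands to a constant of a different integer type, and the macro's pointer-comparison trick turns that mismatch into a compiler warning. A userspace sketch of a max() in the same style (GNU C statement expressions, as the kernel uses; the 256 value mirrors BTRFS_FIRST_FREE_OBJECTID of this era, but treat it as illustrative):

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t u64;

    /* kernel-style type-strict max: if x and y have different types, the
     * pointer comparison below is between incompatible pointer types and
     * the compiler warns -- exactly the warning the (u64) cast silences */
    #define strict_max(x, y) ({                 \
            __typeof__(x) x_ = (x);             \
            __typeof__(y) y_ = (y);             \
            (void)(&x_ == &y_);                 \
            x_ > y_ ? x_ : y_; })

    #define FIRST_FREE_OBJECTID 256ULL  /* stand-in for the btrfs constant */

    int main(void)
    {
            u64 search_start = 42;

            /* on LP64, u64 is unsigned long but 256ULL is unsigned long
             * long: without the cast, &x_ == &y_ mixes pointer types */
            u64 start = strict_max(search_start, (u64)FIRST_FREE_OBJECTID);

            assert(start == 256);
            return 0;
    }
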
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..1c8b0190d031 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
70static struct kmem_cache *btrfs_inode_cachep; 70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
75 74
76#define S_SHIFT 12 75#define S_SHIFT 12
@@ -134,6 +133,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
134 if (!path) 133 if (!path)
135 return -ENOMEM; 134 return -ENOMEM;
136 135
136 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode); 137 btrfs_set_trans_block_group(trans, inode);
138 138
139 key.objectid = inode->i_ino; 139 key.objectid = inode->i_ino;
@@ -167,9 +167,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
167 cur_size = min_t(unsigned long, compressed_size, 167 cur_size = min_t(unsigned long, compressed_size,
168 PAGE_CACHE_SIZE); 168 PAGE_CACHE_SIZE);
169 169
170 kaddr = kmap(cpage); 170 kaddr = kmap_atomic(cpage, KM_USER0);
171 write_extent_buffer(leaf, kaddr, ptr, cur_size); 171 write_extent_buffer(leaf, kaddr, ptr, cur_size);
172 kunmap(cpage); 172 kunmap_atomic(kaddr, KM_USER0);
173 173
174 i++; 174 i++;
175 ptr += cur_size; 175 ptr += cur_size;
@@ -204,7 +204,7 @@ fail:
204 * does the checks required to make sure the data is small enough 204 * does the checks required to make sure the data is small enough
205 * to fit as an inline extent. 205 * to fit as an inline extent.
206 */ 206 */
207static int cow_file_range_inline(struct btrfs_trans_handle *trans, 207static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
208 struct btrfs_root *root, 208 struct btrfs_root *root,
209 struct inode *inode, u64 start, u64 end, 209 struct inode *inode, u64 start, u64 end,
210 size_t compressed_size, 210 size_t compressed_size,
@@ -233,7 +233,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
233 } 233 }
234 234
235 ret = btrfs_drop_extents(trans, root, inode, start, 235 ret = btrfs_drop_extents(trans, root, inode, start,
236 aligned_end, start, &hint_byte); 236 aligned_end, aligned_end, start, &hint_byte);
237 BUG_ON(ret); 237 BUG_ON(ret);
238 238
239 if (isize > actual_end) 239 if (isize > actual_end)
@@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
854 u64 cur_end; 854 u64 cur_end;
855 int limit = 10 * 1024 * 1042; 855 int limit = 10 * 1024 * 1042;
856 856
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 857 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 858 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while (start < end) { 859 while (start < end) {
@@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
935 * If no cow copies or snapshots exist, we write directly to the existing 930 * If no cow copies or snapshots exist, we write directly to the existing
936 * blocks on disk 931 * blocks on disk
937 */ 932 */
938static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 933static noinline int run_delalloc_nocow(struct inode *inode,
934 struct page *locked_page,
939 u64 start, u64 end, int *page_started, int force, 935 u64 start, u64 end, int *page_started, int force,
940 unsigned long *nr_written) 936 unsigned long *nr_written)
941{ 937{
@@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1133 unsigned long *nr_written) 1129 unsigned long *nr_written)
1134{ 1130{
1135 int ret; 1131 int ret;
1132 struct btrfs_root *root = BTRFS_I(inode)->root;
1136 1133
1137 if (btrfs_test_flag(inode, NODATACOW)) 1134 if (btrfs_test_flag(inode, NODATACOW))
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1135 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1140 else if (btrfs_test_flag(inode, PREALLOC)) 1137 else if (btrfs_test_flag(inode, PREALLOC))
1141 ret = run_delalloc_nocow(inode, locked_page, start, end, 1138 ret = run_delalloc_nocow(inode, locked_page, start, end,
1142 page_started, 0, nr_written); 1139 page_started, 0, nr_written);
1140 else if (!btrfs_test_opt(root, COMPRESS))
1141 ret = cow_file_range(inode, locked_page, start, end,
1142 page_started, nr_written, 1);
1143 else 1143 else
1144 ret = cow_file_range_async(inode, locked_page, start, end, 1144 ret = cow_file_range_async(inode, locked_page, start, end,
1145 page_started, nr_written); 1145 page_started, nr_written);
1146
1147 return ret; 1146 return ret;
1148} 1147}
1149 1148
@@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1439 struct inode *inode, u64 file_pos, 1438 struct inode *inode, u64 file_pos,
1440 u64 disk_bytenr, u64 disk_num_bytes, 1439 u64 disk_bytenr, u64 disk_num_bytes,
1441 u64 num_bytes, u64 ram_bytes, 1440 u64 num_bytes, u64 ram_bytes,
1441 u64 locked_end,
1442 u8 compression, u8 encryption, 1442 u8 compression, u8 encryption,
1443 u16 other_encoding, int extent_type) 1443 u16 other_encoding, int extent_type)
1444{ 1444{
@@ -1453,8 +1453,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1453 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1454 BUG_ON(!path); 1454 BUG_ON(!path);
1455 1455
1456 path->leave_spinning = 1;
1456 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1457 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, locked_end,
1459 file_pos, &hint);
1458 BUG_ON(ret); 1460 BUG_ON(ret);
1459 1461
1460 ins.objectid = inode->i_ino; 1462 ins.objectid = inode->i_ino;
@@ -1475,6 +1477,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1475 btrfs_set_file_extent_compression(leaf, fi, compression); 1477 btrfs_set_file_extent_compression(leaf, fi, compression);
1476 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1478 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1477 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1479 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1480
1481 btrfs_unlock_up_safe(path, 1);
1482 btrfs_set_lock_blocking(leaf);
1483
1478 btrfs_mark_buffer_dirty(leaf); 1484 btrfs_mark_buffer_dirty(leaf);
1479 1485
1480 inode_add_bytes(inode, num_bytes); 1486 inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1493,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1487 root->root_key.objectid, 1493 root->root_key.objectid,
1488 trans->transid, inode->i_ino, &ins); 1494 trans->transid, inode->i_ino, &ins);
1489 BUG_ON(ret); 1495 BUG_ON(ret);
1490
1491 btrfs_free_path(path); 1496 btrfs_free_path(path);
1497
1492 return 0; 1498 return 0;
1493} 1499}
1494 1500
1501/*
1502 * helper function for btrfs_finish_ordered_io, this
1503 * just reads in some of the csum leaves to prime them into ram
1504 * before we start the transaction. It limits the amount of btree
1505 * reads required while inside the transaction.
1506 */
1507static noinline void reada_csum(struct btrfs_root *root,
1508 struct btrfs_path *path,
1509 struct btrfs_ordered_extent *ordered_extent)
1510{
1511 struct btrfs_ordered_sum *sum;
1512 u64 bytenr;
1513
1514 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1515 list);
1516 bytenr = sum->sums[0].bytenr;
1517
1518 /*
 1519 * we don't care about the results; the point of this search is
1520 * just to get the btree leaves into ram
1521 */
1522 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1523}
1524
1495/* as ordered data IO finishes, this gets called so we can finish 1525/* as ordered data IO finishes, this gets called so we can finish
1496 * an ordered extent if the range of bytes in the file it covers are 1526 * an ordered extent if the range of bytes in the file it covers are
1497 * fully written. 1527 * fully written.
@@ -1500,8 +1530,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1500{ 1530{
1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1531 struct btrfs_root *root = BTRFS_I(inode)->root;
1502 struct btrfs_trans_handle *trans; 1532 struct btrfs_trans_handle *trans;
1503 struct btrfs_ordered_extent *ordered_extent; 1533 struct btrfs_ordered_extent *ordered_extent = NULL;
1504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1534 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1535 struct btrfs_path *path;
1505 int compressed = 0; 1536 int compressed = 0;
1506 int ret; 1537 int ret;
1507 1538
@@ -1509,9 +1540,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1509 if (!ret) 1540 if (!ret)
1510 return 0; 1541 return 0;
1511 1542
1543 /*
1544 * before we join the transaction, try to do some of our IO.
1545 * This will limit the amount of IO that we have to do with
1546 * the transaction running. We're unlikely to need to do any
 1547 * IO if the file extents are new; the disk_i_size check
 1548 * covers the most common case.
1549 */
1550 if (start < BTRFS_I(inode)->disk_i_size) {
1551 path = btrfs_alloc_path();
1552 if (path) {
1553 ret = btrfs_lookup_file_extent(NULL, root, path,
1554 inode->i_ino,
1555 start, 0);
1556 ordered_extent = btrfs_lookup_ordered_extent(inode,
1557 start);
1558 if (!list_empty(&ordered_extent->list)) {
1559 btrfs_release_path(root, path);
1560 reada_csum(root, path, ordered_extent);
1561 }
1562 btrfs_free_path(path);
1563 }
1564 }
1565
1512 trans = btrfs_join_transaction(root, 1); 1566 trans = btrfs_join_transaction(root, 1);
1513 1567
1514 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1568 if (!ordered_extent)
1569 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1515 BUG_ON(!ordered_extent); 1570 BUG_ON(!ordered_extent);
1516 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1571 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1517 goto nocow; 1572 goto nocow;
@@ -1536,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1536 ordered_extent->disk_len, 1591 ordered_extent->disk_len,
1537 ordered_extent->len, 1592 ordered_extent->len,
1538 ordered_extent->len, 1593 ordered_extent->len,
1594 ordered_extent->file_offset +
1595 ordered_extent->len,
1539 compressed, 0, 0, 1596 compressed, 0, 0,
1540 BTRFS_FILE_EXTENT_REG); 1597 BTRFS_FILE_EXTENT_REG);
1541 BUG_ON(ret); 1598 BUG_ON(ret);
@@ -1765,10 +1822,12 @@ good:
1765 return 0; 1822 return 0;
1766 1823
1767zeroit: 1824zeroit:
1768 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 1825 if (printk_ratelimit()) {
1769 "private %llu\n", page->mapping->host->i_ino, 1826 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 (unsigned long long)start, csum, 1827 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)private); 1828 (unsigned long long)start, csum,
1829 (unsigned long long)private);
1830 }
1772 memset(kaddr + offset, 1, end - start + 1); 1831 memset(kaddr + offset, 1, end - start + 1);
1773 flush_dcache_page(page); 1832 flush_dcache_page(page);
1774 kunmap_atomic(kaddr, KM_USER0); 1833 kunmap_atomic(kaddr, KM_USER0);
@@ -1957,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
1957} 2016}
1958 2017
1959/* 2018/*
2019 * very simple check to peek ahead in the leaf looking for xattrs. If we
2020 * don't find any xattrs, we know there can't be any acls.
2021 *
2022 * slot is the slot the inode is in, objectid is the objectid of the inode
2023 */
2024static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2025 int slot, u64 objectid)
2026{
2027 u32 nritems = btrfs_header_nritems(leaf);
2028 struct btrfs_key found_key;
2029 int scanned = 0;
2030
2031 slot++;
2032 while (slot < nritems) {
2033 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2034
2035 /* we found a different objectid, there must not be acls */
2036 if (found_key.objectid != objectid)
2037 return 0;
2038
2039 /* we found an xattr, assume we've got an acl */
2040 if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2041 return 1;
2042
2043 /*
 2044 * we found a key greater than an xattr key, so there can't
2045 * be any acls later on
2046 */
2047 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2048 return 0;
2049
2050 slot++;
2051 scanned++;
2052
2053 /*
2054 * it goes inode, inode backrefs, xattrs, extents,
2055 * so if there are a ton of hard links to an inode there can
 2056 * be a lot of backrefs. Don't waste time searching too hard;
 2057 * this is just an optimization
2058 */
2059 if (scanned >= 8)
2060 break;
2061 }
2062 /* we hit the end of the leaf before we found an xattr or
2063 * something larger than an xattr. We have to assume the inode
2064 * has acls
2065 */
2066 return 1;
2067}
2068
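
acls_after_inode_item leans on btree key ordering inside a leaf: one inode's items are sorted by (objectid, type), so a short forward scan can prove "no xattrs, hence no ACLs" without ever touching the xattr code. A userspace sketch over a sorted key array (the key-type constant is assumed to match BTRFS_XATTR_ITEM_KEY; the bounded-scan cutoff of 8 follows the code above):

    #include <assert.h>
    #include <stdint.h>

    struct key { uint64_t objectid; uint8_t type; };

    #define XATTR_ITEM_KEY 24       /* assumed BTRFS_XATTR_ITEM_KEY value */

    /* 0: provably no xattrs for this inode; 1: found one or gave up */
    static int may_have_acls(const struct key *k, int nritems,
                             int slot, uint64_t objectid)
    {
            int scanned = 0;

            for (slot++; slot < nritems && scanned < 8; slot++, scanned++) {
                    if (k[slot].objectid != objectid)
                            return 0;       /* next inode's items start here */
                    if (k[slot].type == XATTR_ITEM_KEY)
                            return 1;       /* found an xattr: ACLs possible */
                    if (k[slot].type > XATTR_ITEM_KEY)
                            return 0;       /* sorted past where xattrs live */
            }
            return 1;                       /* ran off the leaf: assume ACLs */
    }

    int main(void)
    {
            struct key leaf[] = { { 5, 1 }, { 5, 12 }, { 6, 1 } };

            /* inode 5's items end before any xattr key appears */
            assert(may_have_acls(leaf, 3, 0, 5) == 0);
            return 0;
    }
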
2069/*
1960 * read an inode from the btree into the in-memory inode 2070 * read an inode from the btree into the in-memory inode
1961 */ 2071 */
1962void btrfs_read_locked_inode(struct inode *inode) 2072void btrfs_read_locked_inode(struct inode *inode)
@@ -1967,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
1967 struct btrfs_timespec *tspec; 2077 struct btrfs_timespec *tspec;
1968 struct btrfs_root *root = BTRFS_I(inode)->root; 2078 struct btrfs_root *root = BTRFS_I(inode)->root;
1969 struct btrfs_key location; 2079 struct btrfs_key location;
2080 int maybe_acls;
1970 u64 alloc_group_block; 2081 u64 alloc_group_block;
1971 u32 rdev; 2082 u32 rdev;
1972 int ret; 2083 int ret;
@@ -2013,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 2124
2014 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2125 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2015 2126
2127 /*
2128 * try to precache a NULL acl entry for files that don't have
2129 * any xattrs or acls
2130 */
2131 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
2132 if (!maybe_acls) {
2133 BTRFS_I(inode)->i_acl = NULL;
2134 BTRFS_I(inode)->i_default_acl = NULL;
2135 }
2136
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2137 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2138 alloc_group_block, 0);
2018 btrfs_free_path(path); 2139 btrfs_free_path(path);
@@ -2101,6 +2222,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2101 2222
2102 path = btrfs_alloc_path(); 2223 path = btrfs_alloc_path();
2103 BUG_ON(!path); 2224 BUG_ON(!path);
2225 path->leave_spinning = 1;
2104 ret = btrfs_lookup_inode(trans, root, path, 2226 ret = btrfs_lookup_inode(trans, root, path,
2105 &BTRFS_I(inode)->location, 1); 2227 &BTRFS_I(inode)->location, 1);
2106 if (ret) { 2228 if (ret) {
@@ -2147,6 +2269,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2147 goto err; 2269 goto err;
2148 } 2270 }
2149 2271
2272 path->leave_spinning = 1;
2150 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2273 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2151 name, name_len, -1); 2274 name, name_len, -1);
2152 if (IS_ERR(di)) { 2275 if (IS_ERR(di)) {
@@ -2190,8 +2313,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2190 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2313 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2191 inode, dir->i_ino); 2314 inode, dir->i_ino);
2192 BUG_ON(ret != 0 && ret != -ENOENT); 2315 BUG_ON(ret != 0 && ret != -ENOENT);
2193 if (ret != -ENOENT)
2194 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2195 2316
2196 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2317 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2197 dir, index); 2318 dir, index);
@@ -2224,6 +2345,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2224 trans = btrfs_start_transaction(root, 1); 2345 trans = btrfs_start_transaction(root, 1);
2225 2346
2226 btrfs_set_trans_block_group(trans, dir); 2347 btrfs_set_trans_block_group(trans, dir);
2348
2349 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2350
2227 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2351 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2228 dentry->d_name.name, dentry->d_name.len); 2352 dentry->d_name.name, dentry->d_name.len);
2229 2353
@@ -2498,6 +2622,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2498 key.type = (u8)-1; 2622 key.type = (u8)-1;
2499 2623
2500search_again: 2624search_again:
2625 path->leave_spinning = 1;
2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2626 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2502 if (ret < 0) 2627 if (ret < 0)
2503 goto error; 2628 goto error;
@@ -2644,6 +2769,7 @@ delete:
2644 break; 2769 break;
2645 } 2770 }
2646 if (found_extent) { 2771 if (found_extent) {
2772 btrfs_set_path_blocking(path);
2647 ret = btrfs_free_extent(trans, root, extent_start, 2773 ret = btrfs_free_extent(trans, root, extent_start,
2648 extent_num_bytes, 2774 extent_num_bytes,
2649 leaf->start, root_owner, 2775 leaf->start, root_owner,
@@ -2818,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2818 err = btrfs_drop_extents(trans, root, inode, 2944 err = btrfs_drop_extents(trans, root, inode,
2819 cur_offset, 2945 cur_offset,
2820 cur_offset + hole_size, 2946 cur_offset + hole_size,
2947 block_end,
2821 cur_offset, &hint_byte); 2948 cur_offset, &hint_byte);
2822 if (err) 2949 if (err)
2823 break; 2950 break;
@@ -2848,11 +2975,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2848 if (err) 2975 if (err)
2849 return err; 2976 return err;
2850 2977
2851 if (S_ISREG(inode->i_mode) && 2978 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2852 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2979 if (attr->ia_size > inode->i_size) {
2853 err = btrfs_cont_expand(inode, attr->ia_size); 2980 err = btrfs_cont_expand(inode, attr->ia_size);
2854 if (err) 2981 if (err)
2855 return err; 2982 return err;
2983 } else if (inode->i_size > 0 &&
2984 attr->ia_size == 0) {
2985
2986 /* we're truncating a file that used to have good
2987 * data down to zero. Make sure it gets into
2988 * the ordered flush list so that any new writes
2989 * get down to disk quickly.
2990 */
2991 BTRFS_I(inode)->ordered_data_close = 1;
2992 }
2856 } 2993 }
2857 2994
2858 err = inode_setattr(inode, attr); 2995 err = inode_setattr(inode, attr);
@@ -2972,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode)
2972{ 3109{
2973 struct btrfs_inode *bi = BTRFS_I(inode); 3110 struct btrfs_inode *bi = BTRFS_I(inode);
2974 3111
2975 bi->i_acl = NULL; 3112 bi->i_acl = BTRFS_ACL_NOT_CACHED;
2976 bi->i_default_acl = NULL; 3113 bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
2977 3114
2978 bi->generation = 0; 3115 bi->generation = 0;
2979 bi->sequence = 0; 3116 bi->sequence = 0;
@@ -2984,13 +3121,15 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3121 bi->disk_i_size = 0;
2985 bi->flags = 0; 3122 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3123 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3124 bi->last_unlink_trans = 0;
3125 bi->ordered_data_close = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3126 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3127 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3128 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3129 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3130 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3131 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3132 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3133 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3134 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3135 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3411,8 +3550,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3411 3550
3412 if (dir) { 3551 if (dir) {
3413 ret = btrfs_set_inode_index(dir, index); 3552 ret = btrfs_set_inode_index(dir, index);
3414 if (ret) 3553 if (ret) {
3554 iput(inode);
3415 return ERR_PTR(ret); 3555 return ERR_PTR(ret);
3556 }
3416 } 3557 }
3417 /* 3558 /*
3418 * index_cnt is ignored for everything but a dir, 3559 * index_cnt is ignored for everything but a dir,
@@ -3449,6 +3590,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3590 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3591 sizes[1] = name_len + sizeof(*ref);
3451 3592
3593 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3594 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3595 if (ret != 0)
3454 goto fail; 3596 goto fail;
@@ -3494,6 +3636,7 @@ fail:
3494 if (dir) 3636 if (dir)
3495 BTRFS_I(dir)->index_cnt--; 3637 BTRFS_I(dir)->index_cnt--;
3496 btrfs_free_path(path); 3638 btrfs_free_path(path);
3639 iput(inode);
3497 return ERR_PTR(ret); 3640 return ERR_PTR(ret);
3498} 3641}
3499 3642
@@ -3727,6 +3870,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3870 drop_inode = 1;
3728 3871
3729 nr = trans->blocks_used; 3872 nr = trans->blocks_used;
3873
3874 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3875 btrfs_end_transaction_throttle(trans, root);
3731fail: 3876fail:
3732 if (drop_inode) { 3877 if (drop_inode) {
@@ -4151,7 +4296,6 @@ out:
4151 } 4296 }
4152 if (err) { 4297 if (err) {
4153 free_extent_map(em); 4298 free_extent_map(em);
4154 WARN_ON(1);
4155 return ERR_PTR(err); 4299 return ERR_PTR(err);
4156 } 4300 }
4157 return em; 4301 return em;
@@ -4292,8 +4436,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4292 * beyond EOF, then the page is guaranteed safe against truncation until we 4436 * beyond EOF, then the page is guaranteed safe against truncation until we
4293 * unlock the page. 4437 * unlock the page.
4294 */ 4438 */
4295int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4439int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4296{ 4440{
4441 struct page *page = vmf->page;
4297 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4442 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4298 struct btrfs_root *root = BTRFS_I(inode)->root; 4443 struct btrfs_root *root = BTRFS_I(inode)->root;
4299 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4444 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4451,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4306 u64 page_end; 4451 u64 page_end;
4307 4452
4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4453 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4309 if (ret) 4454 if (ret) {
4455 if (ret == -ENOMEM)
4456 ret = VM_FAULT_OOM;
4457 else /* -ENOSPC, -EIO, etc */
4458 ret = VM_FAULT_SIGBUS;
4310 goto out; 4459 goto out;
4460 }
4311 4461
4312 ret = -EINVAL; 4462 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4313again: 4463again:
4314 lock_page(page); 4464 lock_page(page);
4315 size = i_size_read(inode); 4465 size = i_size_read(inode);
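
This hunk exists because ->page_mkwrite handlers report results as VM_FAULT_* codes, not errnos: -ENOMEM becomes VM_FAULT_OOM, any other failure becomes VM_FAULT_SIGBUS, and the default VM_FAULT_NOPAGE tells the VM to retry the fault. A sketch of the translation (userspace; the constants are defined locally so it compiles, with values that are illustrative stand-ins):

    #include <assert.h>
    #include <errno.h>

    /* illustrative stand-ins for the kernel's VM_FAULT_* bits */
    #define VM_FAULT_OOM    0x0001
    #define VM_FAULT_SIGBUS 0x0002
    #define VM_FAULT_NOPAGE 0x0100

    /* fault handlers must speak VM_FAULT codes, not errnos */
    static int errno_to_fault(int err)
    {
            if (err == -ENOMEM)
                    return VM_FAULT_OOM;     /* let the OOM machinery decide */
            if (err)
                    return VM_FAULT_SIGBUS;  /* -ENOSPC, -EIO, ...: signal the writer */
            return VM_FAULT_NOPAGE;          /* success path: have the VM retry */
    }

    int main(void)
    {
            assert(errno_to_fault(-ENOMEM) == VM_FAULT_OOM);
            assert(errno_to_fault(-ENOSPC) == VM_FAULT_SIGBUS);
            return 0;
    }
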
@@ -4357,6 +4507,8 @@ again:
4357 } 4507 }
4358 ClearPageChecked(page); 4508 ClearPageChecked(page);
4359 set_page_dirty(page); 4509 set_page_dirty(page);
4510
4511 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4512 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4361 4513
4362out_unlock: 4514out_unlock:
@@ -4382,6 +4534,27 @@ static void btrfs_truncate(struct inode *inode)
4382 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4534 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4383 4535
4384 trans = btrfs_start_transaction(root, 1); 4536 trans = btrfs_start_transaction(root, 1);
4537
4538 /*
4539 * setattr is responsible for setting the ordered_data_close flag,
4540 * but that is only tested during the last file release. That
4541 * could happen well after the next commit, leaving a great big
4542 * window where new writes may get lost if someone chooses to write
4543 * to this file after truncating to zero
4544 *
4545 * The inode doesn't have any dirty data here, and so if we commit
4546 * this is a noop. If someone immediately starts writing to the inode
4547 * it is very likely we'll catch some of their writes in this
4548 * transaction, and the commit will find this file on the ordered
4549 * data list with good things to send down.
4550 *
 4551 * This is a best-effort solution; there is still a window where
4552 * using truncate to replace the contents of the file will
4553 * end up with a zero length file after a crash.
4554 */
4555 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4556 btrfs_add_ordered_operation(trans, root, inode);
4557
4385 btrfs_set_trans_block_group(trans, inode); 4558 btrfs_set_trans_block_group(trans, inode);
4386 btrfs_i_size_write(inode, inode->i_size); 4559 btrfs_i_size_write(inode, inode->i_size);
4387 4560
@@ -4458,12 +4631,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4458 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4631 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4459 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4632 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4460 INIT_LIST_HEAD(&ei->i_orphan); 4633 INIT_LIST_HEAD(&ei->i_orphan);
4634 INIT_LIST_HEAD(&ei->ordered_operations);
4461 return &ei->vfs_inode; 4635 return &ei->vfs_inode;
4462} 4636}
4463 4637
4464void btrfs_destroy_inode(struct inode *inode) 4638void btrfs_destroy_inode(struct inode *inode)
4465{ 4639{
4466 struct btrfs_ordered_extent *ordered; 4640 struct btrfs_ordered_extent *ordered;
4641 struct btrfs_root *root = BTRFS_I(inode)->root;
4642
4467 WARN_ON(!list_empty(&inode->i_dentry)); 4643 WARN_ON(!list_empty(&inode->i_dentry));
4468 WARN_ON(inode->i_data.nrpages); 4644 WARN_ON(inode->i_data.nrpages);
4469 4645
@@ -4474,13 +4650,24 @@ void btrfs_destroy_inode(struct inode *inode)
4474 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4650 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4475 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4651 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4476 4652
4477 spin_lock(&BTRFS_I(inode)->root->list_lock); 4653 /*
4654 * Make sure we're properly removed from the ordered operation
4655 * lists.
4656 */
4657 smp_mb();
4658 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4659 spin_lock(&root->fs_info->ordered_extent_lock);
4660 list_del_init(&BTRFS_I(inode)->ordered_operations);
4661 spin_unlock(&root->fs_info->ordered_extent_lock);
4662 }
4663
4664 spin_lock(&root->list_lock);
4478 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4665 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4479 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4666 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4480 " list\n", inode->i_ino); 4667 " list\n", inode->i_ino);
4481 dump_stack(); 4668 dump_stack();
4482 } 4669 }
4483 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4670 spin_unlock(&root->list_lock);
4484 4671
4485 while (1) { 4672 while (1) {
4486 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4673 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4515,47 +4702,36 @@ void btrfs_destroy_cachep(void)
4515 kmem_cache_destroy(btrfs_trans_handle_cachep); 4702 kmem_cache_destroy(btrfs_trans_handle_cachep);
4516 if (btrfs_transaction_cachep) 4703 if (btrfs_transaction_cachep)
4517 kmem_cache_destroy(btrfs_transaction_cachep); 4704 kmem_cache_destroy(btrfs_transaction_cachep);
4518 if (btrfs_bit_radix_cachep)
4519 kmem_cache_destroy(btrfs_bit_radix_cachep);
4520 if (btrfs_path_cachep) 4705 if (btrfs_path_cachep)
4521 kmem_cache_destroy(btrfs_path_cachep); 4706 kmem_cache_destroy(btrfs_path_cachep);
4522} 4707}
4523 4708
4524struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4525 unsigned long extra_flags,
4526 void (*ctor)(void *))
4527{
4528 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4529 SLAB_MEM_SPREAD | extra_flags), ctor);
4530}
4531
4532int btrfs_init_cachep(void) 4709int btrfs_init_cachep(void)
4533{ 4710{
4534 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", 4711 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
4535 sizeof(struct btrfs_inode), 4712 sizeof(struct btrfs_inode), 0,
4536 0, init_once); 4713 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
4537 if (!btrfs_inode_cachep) 4714 if (!btrfs_inode_cachep)
4538 goto fail; 4715 goto fail;
4539 btrfs_trans_handle_cachep = 4716
4540 btrfs_cache_create("btrfs_trans_handle_cache", 4717 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
4541 sizeof(struct btrfs_trans_handle), 4718 sizeof(struct btrfs_trans_handle), 0,
4542 0, NULL); 4719 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4543 if (!btrfs_trans_handle_cachep) 4720 if (!btrfs_trans_handle_cachep)
4544 goto fail; 4721 goto fail;
4545 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", 4722
4546 sizeof(struct btrfs_transaction), 4723 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
4547 0, NULL); 4724 sizeof(struct btrfs_transaction), 0,
4725 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4548 if (!btrfs_transaction_cachep) 4726 if (!btrfs_transaction_cachep)
4549 goto fail; 4727 goto fail;
4550 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", 4728
4551 sizeof(struct btrfs_path), 4729 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
4552 0, NULL); 4730 sizeof(struct btrfs_path), 0,
4731 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
4553 if (!btrfs_path_cachep) 4732 if (!btrfs_path_cachep)
4554 goto fail; 4733 goto fail;
4555 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, 4734
4556 SLAB_DESTROY_BY_RCU, NULL);
4557 if (!btrfs_bit_radix_cachep)
4558 goto fail;
4559 return 0; 4735 return 0;
4560fail: 4736fail:
4561 btrfs_destroy_cachep(); 4737 btrfs_destroy_cachep();
@@ -4605,8 +4781,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4605 if (ret) 4781 if (ret)
4606 goto out_unlock; 4782 goto out_unlock;
4607 4783
4784 /*
 4785 * we're using rename to replace one file with another,
 4786 * and the replacement file is large. Start IO on it now so
4787 * we don't add too much work to the end of the transaction
4788 */
4789 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4790 new_inode->i_size &&
4791 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4792 filemap_flush(old_inode->i_mapping);
4793
4608 trans = btrfs_start_transaction(root, 1); 4794 trans = btrfs_start_transaction(root, 1);
4609 4795
4796 /*
4797 * make sure the inode gets flushed if it is replacing
4798 * something.
4799 */
4800 if (new_inode && new_inode->i_size &&
4801 old_inode && S_ISREG(old_inode->i_mode)) {
4802 btrfs_add_ordered_operation(trans, root, old_inode);
4803 }
4804
4805 /*
4806 * this is an ugly little race, but the rename is required to make
4807 * sure that if we crash, the inode is either at the old name
4808 * or the new one. pinning the log transaction lets us make sure
4809 * we don't allow a log commit to come in after we unlink the
4810 * name but before we add the new name back in.
4811 */
4812 btrfs_pin_log_trans(root);
4813
4610 btrfs_set_trans_block_group(trans, new_dir); 4814 btrfs_set_trans_block_group(trans, new_dir);
4611 4815
4612 btrfs_inc_nlink(old_dentry->d_inode); 4816 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4818,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4614 new_dir->i_ctime = new_dir->i_mtime = ctime; 4818 new_dir->i_ctime = new_dir->i_mtime = ctime;
4615 old_inode->i_ctime = ctime; 4819 old_inode->i_ctime = ctime;
4616 4820
4821 if (old_dentry->d_parent != new_dentry->d_parent)
4822 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4823
4617 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4824 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4618 old_dentry->d_name.name, 4825 old_dentry->d_name.name,
4619 old_dentry->d_name.len); 4826 old_dentry->d_name.len);
@@ -4645,7 +4852,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4645 if (ret) 4852 if (ret)
4646 goto out_fail; 4853 goto out_fail;
4647 4854
4855 btrfs_log_new_name(trans, old_inode, old_dir,
4856 new_dentry->d_parent);
4648out_fail: 4857out_fail:
4858
4859 /* this btrfs_end_log_trans just allows the current
4860 * log-sub transaction to complete
4861 */
4862 btrfs_end_log_trans(root);
4649 btrfs_end_transaction_throttle(trans, root); 4863 btrfs_end_transaction_throttle(trans, root);
4650out_unlock: 4864out_unlock:
4651 return ret; 4865 return ret;
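Both additions above serve the replace-by-rename pattern that editors and package managers rely on. A minimal sketch of that pattern from userspace (file names are illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* write the new contents to a temp file, then atomically swap it in;
     * after the rename, a crash must leave either the old or the new data
     */
    static int replace_by_rename(const char *tmp, const char *target,
                                 const char *buf)
    {
            int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);

            if (fd < 0)
                    return -1;
            if (write(fd, buf, strlen(buf)) < 0 || fsync(fd) != 0) {
                    close(fd);
                    return -1;
            }
            close(fd);
            return rename(tmp, target);
    }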
@@ -4813,10 +5027,10 @@ out_fail:
4813 return err; 5027 return err;
4814} 5028}
4815 5029
4816static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 5030static int prealloc_file_range(struct btrfs_trans_handle *trans,
4817 u64 alloc_hint, int mode) 5031 struct inode *inode, u64 start, u64 end,
5032 u64 locked_end, u64 alloc_hint, int mode)
4818{ 5033{
4819 struct btrfs_trans_handle *trans;
4820 struct btrfs_root *root = BTRFS_I(inode)->root; 5034 struct btrfs_root *root = BTRFS_I(inode)->root;
4821 struct btrfs_key ins; 5035 struct btrfs_key ins;
4822 u64 alloc_size; 5036 u64 alloc_size;
@@ -4824,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4824 u64 num_bytes = end - start; 5038 u64 num_bytes = end - start;
4825 int ret = 0; 5039 int ret = 0;
4826 5040
4827 trans = btrfs_join_transaction(root, 1);
4828 BUG_ON(!trans);
4829 btrfs_set_trans_block_group(trans, inode);
4830
4831 while (num_bytes > 0) { 5041 while (num_bytes > 0) {
4832 alloc_size = min(num_bytes, root->fs_info->max_extent); 5042 alloc_size = min(num_bytes, root->fs_info->max_extent);
4833 ret = btrfs_reserve_extent(trans, root, alloc_size, 5043 ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -4840,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4840 ret = insert_reserved_file_extent(trans, inode, 5050 ret = insert_reserved_file_extent(trans, inode,
4841 cur_offset, ins.objectid, 5051 cur_offset, ins.objectid,
4842 ins.offset, ins.offset, 5052 ins.offset, ins.offset,
4843 ins.offset, 0, 0, 0, 5053 ins.offset, locked_end,
5054 0, 0, 0,
4844 BTRFS_FILE_EXTENT_PREALLOC); 5055 BTRFS_FILE_EXTENT_PREALLOC);
4845 BUG_ON(ret); 5056 BUG_ON(ret);
4846 num_bytes -= ins.offset; 5057 num_bytes -= ins.offset;
@@ -4858,7 +5069,6 @@ out:
4858 BUG_ON(ret); 5069 BUG_ON(ret);
4859 } 5070 }
4860 5071
4861 btrfs_end_transaction(trans, root);
4862 return ret; 5072 return ret;
4863} 5073}
4864 5074
@@ -4870,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4870 u64 alloc_start; 5080 u64 alloc_start;
4871 u64 alloc_end; 5081 u64 alloc_end;
4872 u64 alloc_hint = 0; 5082 u64 alloc_hint = 0;
5083 u64 locked_end;
4873 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5084 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4874 struct extent_map *em; 5085 struct extent_map *em;
5086 struct btrfs_trans_handle *trans;
4875 int ret; 5087 int ret;
4876 5088
4877 alloc_start = offset & ~mask; 5089 alloc_start = offset & ~mask;
4878 alloc_end = (offset + len + mask) & ~mask; 5090 alloc_end = (offset + len + mask) & ~mask;
4879 5091
5092 /*
5093 * wait for ordered IO before we have any locks. We'll loop again
5094 * below with the locks held.
5095 */
5096 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
5097
4880 mutex_lock(&inode->i_mutex); 5098 mutex_lock(&inode->i_mutex);
4881 if (alloc_start > inode->i_size) { 5099 if (alloc_start > inode->i_size) {
4882 ret = btrfs_cont_expand(inode, alloc_start); 5100 ret = btrfs_cont_expand(inode, alloc_start);
@@ -4884,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4884 goto out; 5102 goto out;
4885 } 5103 }
4886 5104
5105 locked_end = alloc_end - 1;
4887 while (1) { 5106 while (1) {
4888 struct btrfs_ordered_extent *ordered; 5107 struct btrfs_ordered_extent *ordered;
4889 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, 5108
4890 alloc_end - 1, GFP_NOFS); 5109 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5110 if (!trans) {
5111 ret = -EIO;
5112 goto out;
5113 }
5114
5115 /* the extent lock is ordered inside the running
5116 * transaction
5117 */
5118 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5119 GFP_NOFS);
4891 ordered = btrfs_lookup_first_ordered_extent(inode, 5120 ordered = btrfs_lookup_first_ordered_extent(inode,
4892 alloc_end - 1); 5121 alloc_end - 1);
4893 if (ordered && 5122 if (ordered &&
@@ -4895,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4895 ordered->file_offset < alloc_end) { 5124 ordered->file_offset < alloc_end) {
4896 btrfs_put_ordered_extent(ordered); 5125 btrfs_put_ordered_extent(ordered);
4897 unlock_extent(&BTRFS_I(inode)->io_tree, 5126 unlock_extent(&BTRFS_I(inode)->io_tree,
4898 alloc_start, alloc_end - 1, GFP_NOFS); 5127 alloc_start, locked_end, GFP_NOFS);
5128 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5129
5130 /*
5131 * we can't wait on the range with the transaction
5132 * running or with the extent lock held
5133 */
4899 btrfs_wait_ordered_range(inode, alloc_start, 5134 btrfs_wait_ordered_range(inode, alloc_start,
4900 alloc_end - alloc_start); 5135 alloc_end - alloc_start);
4901 } else { 5136 } else {
@@ -4913,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4913 last_byte = min(extent_map_end(em), alloc_end); 5148 last_byte = min(extent_map_end(em), alloc_end);
4914 last_byte = (last_byte + mask) & ~mask; 5149 last_byte = (last_byte + mask) & ~mask;
4915 if (em->block_start == EXTENT_MAP_HOLE) { 5150 if (em->block_start == EXTENT_MAP_HOLE) {
4916 ret = prealloc_file_range(inode, cur_offset, 5151 ret = prealloc_file_range(trans, inode, cur_offset,
4917 last_byte, alloc_hint, mode); 5152 last_byte, locked_end + 1,
5153 alloc_hint, mode);
4918 if (ret < 0) { 5154 if (ret < 0) {
4919 free_extent_map(em); 5155 free_extent_map(em);
4920 break; 5156 break;
@@ -4930,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
4930 break; 5166 break;
4931 } 5167 }
4932 } 5168 }
4933 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, 5169 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
4934 GFP_NOFS); 5170 GFP_NOFS);
5171
5172 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
4935out: 5173out:
4936 mutex_unlock(&inode->i_mutex); 5174 mutex_unlock(&inode->i_mutex);
4937 return ret; 5175 return ret;
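With prealloc_file_range() now running inside the caller's transaction, the whole fallocate path sits under one extent-lock/transaction pairing. A userspace trigger for this path, assuming an illustrative mount point:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* preallocate 16MiB; holes in the range are filled by
             * prealloc_file_range() as BTRFS_FILE_EXTENT_PREALLOC extents
             */
            int fd = open("/mnt/btrfs/prealloc-test", O_RDWR | O_CREAT, 0644);

            if (fd < 0 || fallocate(fd, 0, 0, 16 * 1024 * 1024) != 0) {
                    perror("fallocate");
                    return 1;
            }
            close(fd);
            return 0;
    }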
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..2624b53ea783 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
@@ -437,10 +437,6 @@ out_unlock:
437 return 0; 437 return 0;
438} 438}
439 439
440/*
441 * Called inside transaction, so use GFP_NOFS
442 */
443
444static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) 440static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
445{ 441{
446 u64 new_size; 442 u64 new_size;
@@ -461,15 +457,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
461 if (!capable(CAP_SYS_ADMIN)) 457 if (!capable(CAP_SYS_ADMIN))
462 return -EPERM; 458 return -EPERM;
463 459
464 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 460 vol_args = memdup_user(arg, sizeof(*vol_args));
465 461 if (IS_ERR(vol_args))
466 if (!vol_args) 462 return PTR_ERR(vol_args);
467 return -ENOMEM;
468
469 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
470 ret = -EFAULT;
471 goto out;
472 }
473 463
474 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 464 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
475 namelen = strlen(vol_args->name); 465 namelen = strlen(vol_args->name);
@@ -483,11 +473,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
483 *devstr = '\0'; 473 *devstr = '\0';
484 devstr = vol_args->name; 474 devstr = vol_args->name;
485 devid = simple_strtoull(devstr, &end, 10); 475 devid = simple_strtoull(devstr, &end, 10);
486 printk(KERN_INFO "resizing devid %llu\n", devid); 476 printk(KERN_INFO "resizing devid %llu\n",
477 (unsigned long long)devid);
487 } 478 }
488 device = btrfs_find_device(root, devid, NULL, NULL); 479 device = btrfs_find_device(root, devid, NULL, NULL);
489 if (!device) { 480 if (!device) {
490 printk(KERN_INFO "resizer unable to find device %llu\n", devid); 481 printk(KERN_INFO "resizer unable to find device %llu\n",
482 (unsigned long long)devid);
491 ret = -EINVAL; 483 ret = -EINVAL;
492 goto out_unlock; 484 goto out_unlock;
493 } 485 }
@@ -545,7 +537,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
545 537
546out_unlock: 538out_unlock:
547 mutex_unlock(&root->fs_info->volume_mutex); 539 mutex_unlock(&root->fs_info->volume_mutex);
548out:
549 kfree(vol_args); 540 kfree(vol_args);
550 return ret; 541 return ret;
551} 542}
@@ -565,15 +556,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
565 if (root->fs_info->sb->s_flags & MS_RDONLY) 556 if (root->fs_info->sb->s_flags & MS_RDONLY)
566 return -EROFS; 557 return -EROFS;
567 558
568 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 559 vol_args = memdup_user(arg, sizeof(*vol_args));
569 560 if (IS_ERR(vol_args))
570 if (!vol_args) 561 return PTR_ERR(vol_args);
571 return -ENOMEM;
572
573 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
574 ret = -EFAULT;
575 goto out;
576 }
577 562
578 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 563 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
579 namelen = strlen(vol_args->name); 564 namelen = strlen(vol_args->name);
@@ -675,19 +660,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
675 if (!capable(CAP_SYS_ADMIN)) 660 if (!capable(CAP_SYS_ADMIN))
676 return -EPERM; 661 return -EPERM;
677 662
678 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 663 vol_args = memdup_user(arg, sizeof(*vol_args));
679 664 if (IS_ERR(vol_args))
680 if (!vol_args) 665 return PTR_ERR(vol_args);
681 return -ENOMEM;
682 666
683 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
684 ret = -EFAULT;
685 goto out;
686 }
687 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 667 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
688 ret = btrfs_init_new_device(root, vol_args->name); 668 ret = btrfs_init_new_device(root, vol_args->name);
689 669
690out:
691 kfree(vol_args); 670 kfree(vol_args);
692 return ret; 671 return ret;
693} 672}
@@ -703,19 +682,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
703 if (root->fs_info->sb->s_flags & MS_RDONLY) 682 if (root->fs_info->sb->s_flags & MS_RDONLY)
704 return -EROFS; 683 return -EROFS;
705 684
706 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 685 vol_args = memdup_user(arg, sizeof(*vol_args));
707 686 if (IS_ERR(vol_args))
708 if (!vol_args) 687 return PTR_ERR(vol_args);
709 return -ENOMEM;
710 688
711 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
712 ret = -EFAULT;
713 goto out;
714 }
715 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 689 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
716 ret = btrfs_rm_device(root, vol_args->name); 690 ret = btrfs_rm_device(root, vol_args->name);
717 691
718out:
719 kfree(vol_args); 692 kfree(vol_args);
720 return ret; 693 return ret;
721} 694}
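The ioctl conversions above all collapse the open-coded kmalloc()/copy_from_user() pair into memdup_user(), which also explains the dropped "use GFP_NOFS" comment: memdup_user() allocates with GFP_KERNEL. The resulting shape, sketched in isolation (the function name is illustrative):

    /* shape shared by the converted ioctls above */
    static long example_ioctl(struct btrfs_root *root, void __user *arg)
    {
            struct btrfs_ioctl_vol_args *vol_args;
            long ret;

            vol_args = memdup_user(arg, sizeof(*vol_args));
            if (IS_ERR(vol_args))
                    return PTR_ERR(vol_args);

            vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
            ret = 0; /* ... act on vol_args->name ... */

            kfree(vol_args);
            return ret;
    }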
@@ -830,7 +803,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
830 BUG_ON(!trans); 803 BUG_ON(!trans);
831 804
832 /* punch hole in destination first */ 805 /* punch hole in destination first */
833 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); 806 btrfs_drop_extents(trans, root, inode, off, off + len,
807 off + len, 0, &hint_byte);
834 808
835 /* clone data */ 809 /* clone data */
836 key.objectid = src->i_ino; 810 key.objectid = src->i_ino;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
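The reordered cpu_relax() calls and the new pre-spin in btrfs_tree_lock() all implement the same adaptive shape: spin a bounded number of times while the holder is likely to finish soon, then fall back to sleeping. A distilled userspace analog of that shape (not the btrfs code itself):

    #include <sched.h>
    #include <stdatomic.h>

    /* spin a bounded number of times on a "blocking" flag; return 1 if
     * the flag cleared (take the fast path), 0 if the caller should sleep
     */
    static int spin_on_flag(const atomic_int *blocking)
    {
            for (int i = 0; i < 512; i++) {
                    if (!atomic_load(blocking))
                            return 1;
                    sched_yield();  /* userspace stand-in for cpu_relax() */
            }
            return 0;
    }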
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
 387 * we have two modes here: one is to just start the IO via filemap_flush
 388 * and the other is to wait for all the IO. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
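The two modes map directly onto how the commit path (see the transaction.c hunks later in this diff) drives the function:

    /* condensed from btrfs_commit_transaction() below */
    btrfs_run_ordered_operations(root, 0);  /* early: just start the IO */
    /* ... transaction is marked blocked, no new entries can appear ... */
    btrfs_run_ordered_operations(root, 1);  /* later: wait for all of it */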
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -417,7 +489,7 @@ again:
417 /* start IO across the range first to instantiate any delalloc 489 /* start IO across the range first to instantiate any delalloc
418 * extents 490 * extents
419 */ 491 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 492 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
421 493
422 /* The compression code will leave pages locked but return from 494 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again 495 * writepage without setting the page writeback. Starting again
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
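The call sites added in this merge follow one shape; condensed from the btrfs_rename() hunk earlier in this diff:

    trans = btrfs_start_transaction(root, 1);
    /* make sure a replaced regular file is flushed at commit time */
    if (new_inode && new_inode->i_size &&
        old_inode && S_ISREG(old_inode->i_mode))
            btrfs_add_ordered_operation(trans, root, old_inode);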
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..2ff7cd2db25f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_ratio, Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,9 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
90 {Opt_ratio, "metadata_ratio=%d"},
86 {Opt_err, NULL}, 91 {Opt_err, NULL},
87}; 92};
88 93
@@ -191,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
191 info->max_extent = max_t(u64, 196 info->max_extent = max_t(u64,
192 info->max_extent, root->sectorsize); 197 info->max_extent, root->sectorsize);
193 printk(KERN_INFO "btrfs: max_extent at %llu\n", 198 printk(KERN_INFO "btrfs: max_extent at %llu\n",
194 info->max_extent); 199 (unsigned long long)info->max_extent);
195 } 200 }
196 break; 201 break;
197 case Opt_max_inline: 202 case Opt_max_inline:
@@ -206,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206 root->sectorsize); 211 root->sectorsize);
207 } 212 }
208 printk(KERN_INFO "btrfs: max_inline at %llu\n", 213 printk(KERN_INFO "btrfs: max_inline at %llu\n",
209 info->max_inline); 214 (unsigned long long)info->max_inline);
210 } 215 }
211 break; 216 break;
212 case Opt_alloc_start: 217 case Opt_alloc_start:
@@ -216,12 +221,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
216 kfree(num); 221 kfree(num);
217 printk(KERN_INFO 222 printk(KERN_INFO
218 "btrfs: allocations start at %llu\n", 223 "btrfs: allocations start at %llu\n",
219 info->alloc_start); 224 (unsigned long long)info->alloc_start);
220 } 225 }
221 break; 226 break;
222 case Opt_noacl: 227 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 228 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 229 break;
230 case Opt_notreelog:
231 printk(KERN_INFO "btrfs: disabling tree log\n");
232 btrfs_set_opt(info->mount_opt, NOTREELOG);
233 break;
234 case Opt_flushoncommit:
235 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
236 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
237 break;
238 case Opt_ratio:
239 intarg = 0;
240 match_int(&args[0], &intarg);
241 if (intarg) {
242 info->metadata_ratio = intarg;
243 printk(KERN_INFO "btrfs: metadata ratio %d\n",
244 info->metadata_ratio);
245 }
246 break;
225 default: 247 default:
226 break; 248 break;
227 } 249 }
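In practice the three new options are passed as ordinary mount flags, e.g. mount -o notreelog,flushoncommit,metadata_ratio=4 <device> <mountpoint> (device and mount point illustrative): notreelog disables the tree log so fsync falls back to full transaction commits, flushoncommit pushes delalloc data down at every commit, and metadata_ratio tunes how much metadata chunk space is kept relative to data.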
@@ -363,9 +385,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 385int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 386{
365 struct btrfs_trans_handle *trans; 387 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 388 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 389 int ret;
368 root = btrfs_sb(sb);
369 390
370 if (sb->s_flags & MS_RDONLY) 391 if (sb->s_flags & MS_RDONLY)
371 return 0; 392 return 0;
@@ -385,6 +406,44 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 406 return ret;
386} 407}
387 408
409static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
410{
411 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
412 struct btrfs_fs_info *info = root->fs_info;
413
414 if (btrfs_test_opt(root, DEGRADED))
415 seq_puts(seq, ",degraded");
416 if (btrfs_test_opt(root, NODATASUM))
417 seq_puts(seq, ",nodatasum");
418 if (btrfs_test_opt(root, NODATACOW))
419 seq_puts(seq, ",nodatacow");
420 if (btrfs_test_opt(root, NOBARRIER))
421 seq_puts(seq, ",nobarrier");
422 if (info->max_extent != (u64)-1)
423 seq_printf(seq, ",max_extent=%llu",
424 (unsigned long long)info->max_extent);
425 if (info->max_inline != 8192 * 1024)
426 seq_printf(seq, ",max_inline=%llu",
427 (unsigned long long)info->max_inline);
428 if (info->alloc_start != 0)
429 seq_printf(seq, ",alloc_start=%llu",
430 (unsigned long long)info->alloc_start);
431 if (info->thread_pool_size != min_t(unsigned long,
432 num_online_cpus() + 2, 8))
433 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
434 if (btrfs_test_opt(root, COMPRESS))
435 seq_puts(seq, ",compress");
436 if (btrfs_test_opt(root, SSD))
437 seq_puts(seq, ",ssd");
438 if (btrfs_test_opt(root, NOTREELOG))
439 seq_puts(seq, ",notreelog");
440 if (btrfs_test_opt(root, FLUSHONCOMMIT))
441 seq_puts(seq, ",flushoncommit");
442 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
443 seq_puts(seq, ",noacl");
444 return 0;
445}
446
388static void btrfs_write_super(struct super_block *sb) 447static void btrfs_write_super(struct super_block *sb)
389{ 448{
390 sb->s_dirt = 0; 449 sb->s_dirt = 0;
@@ -443,8 +502,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
443 502
444 if (s->s_root) { 503 if (s->s_root) {
445 if ((flags ^ s->s_flags) & MS_RDONLY) { 504 if ((flags ^ s->s_flags) & MS_RDONLY) {
446 up_write(&s->s_umount); 505 deactivate_locked_super(s);
447 deactivate_super(s);
448 error = -EBUSY; 506 error = -EBUSY;
449 goto error_close_devices; 507 goto error_close_devices;
450 } 508 }
@@ -458,8 +516,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
458 error = btrfs_fill_super(s, fs_devices, data, 516 error = btrfs_fill_super(s, fs_devices, data,
459 flags & MS_SILENT ? 1 : 0); 517 flags & MS_SILENT ? 1 : 0);
460 if (error) { 518 if (error) {
461 up_write(&s->s_umount); 519 deactivate_locked_super(s);
462 deactivate_super(s);
463 goto error_free_subvol_name; 520 goto error_free_subvol_name;
464 } 521 }
465 522
@@ -476,15 +533,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
476 mutex_unlock(&s->s_root->d_inode->i_mutex); 533 mutex_unlock(&s->s_root->d_inode->i_mutex);
477 534
478 if (IS_ERR(root)) { 535 if (IS_ERR(root)) {
479 up_write(&s->s_umount); 536 deactivate_locked_super(s);
480 deactivate_super(s);
481 error = PTR_ERR(root); 537 error = PTR_ERR(root);
482 goto error_free_subvol_name; 538 goto error_free_subvol_name;
483 } 539 }
484 if (!root->d_inode) { 540 if (!root->d_inode) {
485 dput(root); 541 dput(root);
486 up_write(&s->s_umount); 542 deactivate_locked_super(s);
487 deactivate_super(s);
488 error = -ENXIO; 543 error = -ENXIO;
489 goto error_free_subvol_name; 544 goto error_free_subvol_name;
490 } 545 }
@@ -589,14 +644,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
589 if (!capable(CAP_SYS_ADMIN)) 644 if (!capable(CAP_SYS_ADMIN))
590 return -EPERM; 645 return -EPERM;
591 646
592 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 647 vol = memdup_user((void __user *)arg, sizeof(*vol));
593 if (!vol) 648 if (IS_ERR(vol))
594 return -ENOMEM; 649 return PTR_ERR(vol);
595
596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
597 ret = -EFAULT;
598 goto out;
599 }
600 650
601 switch (cmd) { 651 switch (cmd) {
602 case BTRFS_IOC_SCAN_DEV: 652 case BTRFS_IOC_SCAN_DEV:
@@ -604,7 +654,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
604 &btrfs_fs_type, &fs_devices); 654 &btrfs_fs_type, &fs_devices);
605 break; 655 break;
606 } 656 }
607out: 657
608 kfree(vol); 658 kfree(vol);
609 return ret; 659 return ret;
610} 660}
@@ -630,7 +680,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 680 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 681 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 682 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 683 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 684 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 685 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 686 .alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..01b143605ec1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 63 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 64 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 65 cur_trans->start_time = get_seconds();
66
67 cur_trans->delayed_refs.root.rb_node = NULL;
68 cur_trans->delayed_refs.num_entries = 0;
69 cur_trans->delayed_refs.num_heads_ready = 0;
70 cur_trans->delayed_refs.num_heads = 0;
71 cur_trans->delayed_refs.flushing = 0;
72 cur_trans->delayed_refs.run_delayed_start = 0;
73 spin_lock_init(&cur_trans->delayed_refs.lock);
74
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 77 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 189 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 190 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 191 h->alloc_exclude_start = 0;
192 h->delayed_ref_updates = 0;
193
185 root->fs_info->running_transaction->use_count++; 194 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 195 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 196 return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 280 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 281 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 282 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 283 throttle_on_drops(root);
276} 284}
277 285
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 288{
281 struct btrfs_transaction *cur_trans; 289 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 290 struct btrfs_fs_info *info = root->fs_info;
291 int count = 0;
292
293 while (count < 4) {
294 unsigned long cur = trans->delayed_ref_updates;
295 trans->delayed_ref_updates = 0;
296 if (cur &&
297 trans->transaction->delayed_refs.num_heads_ready > 64) {
298 trans->delayed_ref_updates = 0;
299
300 /*
301 * do a full flush if the transaction is trying
302 * to close
303 */
304 if (trans->transaction->delayed_refs.flushing)
305 cur = 0;
306 btrfs_run_delayed_refs(trans, root, cur);
307 } else {
308 break;
309 }
310 count++;
311 }
283 312
284 mutex_lock(&info->trans_mutex); 313 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 314 cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 453 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 454 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 455
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 456 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 457
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
430 460
431 while (1) { 461 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 462 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 468 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 469 btrfs_set_root_generation(&root->root_item, trans->transid);
440 470
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 471 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 472 &root->root_key,
445 &root->root_item); 473 &root->root_item);
446 BUG_ON(ret); 474 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 475 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 476
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret);
449 } 479 }
450 return 0; 480 return 0;
451} 481}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 489 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 490 struct list_head *next;
461 struct extent_buffer *eb; 491 struct extent_buffer *eb;
492 int ret;
462 493
463 btrfs_extent_post_op(trans, fs_info->tree_root); 494 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
495 BUG_ON(ret);
464 496
465 eb = btrfs_lock_root_node(fs_info->tree_root); 497 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 498 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 499 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 500 free_extent_buffer(eb);
469 501
470 btrfs_extent_post_op(trans, fs_info->tree_root); 502 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
503 BUG_ON(ret);
471 504
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 505 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 506 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 508 root = list_entry(next, struct btrfs_root, dirty_list);
476 509
477 update_cowonly_root(trans, root); 510 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
478 } 514 }
479 return 0; 515 return 0;
480} 516}
@@ -635,6 +671,37 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 671}
636 672
637/* 673/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current
676 * queue of delayed refs out.
677 *
678 * This is used by the drop snapshot code only
679 */
680static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
681{
682 DEFINE_WAIT(wait);
683
684 mutex_lock(&info->trans_mutex);
685 while (info->running_transaction &&
686 info->running_transaction->delayed_refs.flushing) {
687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex);
690
691 atomic_dec(&info->throttles);
692 wake_up(&info->transaction_throttle);
693
694 schedule();
695
696 atomic_inc(&info->throttles);
697 mutex_lock(&info->trans_mutex);
698 finish_wait(&info->transaction_wait, &wait);
699 }
700 mutex_unlock(&info->trans_mutex);
701 return 0;
702}
703
704/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 705 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 706 * all of them
640 */ 707 */
@@ -661,7 +728,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 728 atomic_inc(&root->fs_info->throttles);
662 729
663 while (1) { 730 while (1) {
731 /*
732 * we don't want to jump in and create a bunch of
733 * delayed refs if the transaction is starting to close
734 */
735 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 736 trans = btrfs_start_transaction(tree_root, 1);
737
738 /*
739 * we've joined a transaction, make sure it isn't
740 * closing right now
741 */
742 if (trans->transaction->delayed_refs.flushing) {
743 btrfs_end_transaction(trans, tree_root);
744 continue;
745 }
746
665 mutex_lock(&root->fs_info->drop_mutex); 747 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 748 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 749 if (ret != -EAGAIN)
@@ -766,7 +848,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 848 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 849
768 old = btrfs_lock_root_node(root); 850 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 851 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 852
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 853 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 854 btrfs_tree_unlock(old);
@@ -894,12 +976,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 976 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 977 DEFINE_WAIT(wait);
896 int ret; 978 int ret;
979 int should_grow = 0;
980 unsigned long now = get_seconds();
981 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
982
983 btrfs_run_ordered_operations(root, 0);
984
 985 /* make a pass through all the delayed refs we have so far;
 986 * any running procs may add more while we are here
987 */
988 ret = btrfs_run_delayed_refs(trans, root, 0);
989 BUG_ON(ret);
990
991 cur_trans = trans->transaction;
992 /*
993 * set the flushing flag so procs in this transaction have to
994 * start sending their work down.
995 */
996 cur_trans->delayed_refs.flushing = 1;
997
998 ret = btrfs_run_delayed_refs(trans, root, 0);
999 BUG_ON(ret);
897 1000
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 1001 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 1002 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 1003 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 1004 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 1005 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1006 btrfs_end_transaction(trans, root);
905 1007
@@ -922,7 +1024,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1024
923 trans->transaction->in_commit = 1; 1025 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1026 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1027 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1028 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1029 struct btrfs_transaction, list);
@@ -937,6 +1038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1038 }
938 } 1039 }
939 1040
1041 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1042 should_grow = 1;
1043
940 do { 1044 do {
941 int snap_pending = 0; 1045 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1046 joined = cur_trans->num_joined;
@@ -949,26 +1053,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1053
950 if (cur_trans->num_writers > 1) 1054 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1055 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1056 else if (should_grow)
953 timeout = 1; 1057 timeout = 1;
954 1058
955 mutex_unlock(&root->fs_info->trans_mutex); 1059 mutex_unlock(&root->fs_info->trans_mutex);
956 1060
957 if (snap_pending) { 1061 if (flush_on_commit || snap_pending) {
1062 if (flush_on_commit)
1063 btrfs_start_delalloc_inodes(root);
958 ret = btrfs_wait_ordered_extents(root, 1); 1064 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret); 1065 BUG_ON(ret);
960 } 1066 }
961 1067
962 schedule_timeout(timeout); 1068 /*
 1069 * rename doesn't use btrfs_join_transaction, so once we
1070 * set the transaction to blocked above, we aren't going
1071 * to get any new ordered operations. We can safely run
 1072 * it here and know for sure that nothing new will be added
1073 * to the list
1074 */
1075 btrfs_run_ordered_operations(root, 1);
1076
1077 smp_mb();
1078 if (cur_trans->num_writers > 1 || should_grow)
1079 schedule_timeout(timeout);
963 1080
964 mutex_lock(&root->fs_info->trans_mutex); 1081 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1082 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1083 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1084 (should_grow && cur_trans->num_joined != joined));
968 1085
969 ret = create_pending_snapshots(trans, root->fs_info); 1086 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1087 BUG_ON(ret);
971 1088
1089 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1090 BUG_ON(ret);
1091
972 WARN_ON(cur_trans != trans->transaction); 1092 WARN_ON(cur_trans != trans->transaction);
973 1093
974 /* btrfs_commit_tree_roots is responsible for getting the 1094 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1152 btrfs_copy_pinned(root, pinned_copy);
1033 1153
1034 trans->transaction->blocked = 0; 1154 trans->transaction->blocked = 0;
1155
1035 wake_up(&root->fs_info->transaction_throttle); 1156 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1157 wake_up(&root->fs_info->transaction_wait);
1037 1158
@@ -1058,6 +1179,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1179 mutex_lock(&root->fs_info->trans_mutex);
1059 1180
1060 cur_trans->commit_done = 1; 1181 cur_trans->commit_done = 1;
1182
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1183 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1184 wake_up(&cur_trans->commit_wait);
1063 1185
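Pulling the hunks above together, the commit now interleaves delayed-ref processing with the ordered-operation flush; the rough order of the added calls:

    /* rough sequence inside btrfs_commit_transaction() after this merge */
    btrfs_run_ordered_operations(root, 0);       /* start data IO early */
    btrfs_run_delayed_refs(trans, root, 0);      /* drain what exists */
    cur_trans->delayed_refs.flushing = 1;        /* make writers help out */
    btrfs_run_delayed_refs(trans, root, 0);
    /* ... wait for other writers, then ... */
    btrfs_run_ordered_operations(root, 1);       /* now wait for the IO */
    btrfs_run_delayed_refs(trans, root, (unsigned long)-1); /* finish all */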
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
 27 * total writers in this transaction; it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..db5e212e8445 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
 62 * 2a is actually the more important variant. Without the extra logging,
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
 73 * The directory foo was fully removed from the FS, but fsync was never
 74 * called on foo, only its parent dir f1. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
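Case 2 above is the one ordinary applications exercise; as a userspace sequence (paths illustrative):

    #include <fcntl.h>
    #include <unistd.h>

    /* a file already in the fsync log gains a new name; the new name
     * must survive a crash once the second fsync returns
     */
    static int new_name_must_survive(void)
    {
            int fd = open("/mnt/btrfs/foo/file", O_RDWR);

            if (fd < 0)
                    return -1;
            fsync(fd);          /* file is now in the fsync log */
            if (link("/mnt/btrfs/foo/file", "/mnt/btrfs/bar/file") != 0) {
                    close(fd);
                    return -1;
            }
            fsync(fd);          /* the new name must be logged too */
            return close(fd);
    }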
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
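btrfs_end_log_trans(), un-staticed just below, is the matching release; the rename path shown earlier in this diff pairs them like this, condensed:

    btrfs_pin_log_trans(root);           /* hold off any log commit */
    /* ... unlink the old name, insert the new one ... */
    btrfs_log_new_name(trans, old_inode, old_dir, new_dentry->d_parent);
    btrfs_end_log_trans(root);           /* let the log commit proceed */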
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -476,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
476 saved_nbytes = inode_get_bytes(inode); 536 saved_nbytes = inode_get_bytes(inode);
477 /* drop any overlapping extents */ 537 /* drop any overlapping extents */
478 ret = btrfs_drop_extents(trans, root, inode, 538 ret = btrfs_drop_extents(trans, root, inode,
479 start, extent_end, start, &alloc_hint); 539 start, extent_end, extent_end, start, &alloc_hint);
480 BUG_ON(ret); 540 BUG_ON(ret);
481 541
482 if (found_type == BTRFS_FILE_EXTENT_REG || 542 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highest possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
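
The key.offset = (u64)-1 restart is the point of this hunk: running one fixup can insert new fixup items at other offsets, so decrementing from the last position could walk past work that appeared behind the cursor. A runnable toy of the restart-from-the-top scan (array indices stand in for key offsets; illustrative only):

        #include <stdio.h>

        #define NKEYS 8
        static int pending[NKEYS];      /* pending[k] != 0: key k needs fixup */

        static int highest_pending(void)
        {
                for (int k = NKEYS - 1; k >= 0; k--)
                        if (pending[k])
                                return k;
                return -1;              /* nothing left */
        }

        int main(void)
        {
                int k;

                pending[5] = pending[6] = 1;
                /* each pass searches down from the top, like key.offset = (u64)-1 */
                while ((k = highest_pending()) >= 0) {
                        pending[k] = 0;          /* run the fixup for key k */
                        if (k == 5)
                                pending[2] = 1;  /* a fixup may queue new work */
                        printf("fixed up key %d\n", k);
                }
                return 0;
        }
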
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
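
The btrfs_inc_nlink() before iput() above is a lifetime trick: with i_nlink at zero, the final iput() would delete the inode before the later link-count fixup pass has a chance to settle the real count. The same keep-alive shape in plain C, with an explicit reference count standing in for the link count (illustrative only, not the kernel's inode lifetime rules):

        #include <stdio.h>
        #include <stdlib.h>

        struct obj {
                int refs;
        };

        static void put_obj(struct obj *o)      /* like iput() */
        {
                if (--o->refs == 0) {
                        printf("freed\n");
                        free(o);
                }
        }

        int main(void)
        {
                struct obj *o = malloc(sizeof(*o));

                o->refs = 1;
                o->refs++;      /* keep-alive, like btrfs_inc_nlink() */
                put_obj(o);     /* replay drops its reference: object survives */
                put_obj(o);     /* fixup pass finishes and really releases it */
                return 0;
        }
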
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
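
Both wait loops now re-test last_trans_log_full_commit inside the prepare_to_wait()/schedule()/finish_wait() sequence, so a transaction that has been flagged for a full commit stops blocking on log progress that may never come. In pthreads the same shape collapses to a condition wait, which drops and retakes the mutex exactly the way the kernel drops log_mutex around schedule() (a sketch, not kernel code):

        #include <pthread.h>

        static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  log_writer_wait = PTHREAD_COND_INITIALIZER;
        static int log_writers;
        static int full_commit_forced;  /* last_trans_log_full_commit == transid */

        /* illustrative equivalent of wait_for_writer() */
        static void wait_for_writer_model(void)
        {
                pthread_mutex_lock(&log_mutex);
                /* re-test after every wakeup; bail if a full commit was forced */
                while (log_writers > 0 && !full_commit_forced)
                        pthread_cond_wait(&log_writer_wait, &log_mutex);
                pthread_mutex_unlock(&log_mutex);
        }
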
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1976 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
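The contract spelled out in the comment above is the caller's problem: 0 means the tree log alone made the fsync durable, anything else (-EAGAIN for the unlink/rename corner cases) means only a full transaction commit is safe. A runnable sketch of a caller honoring that contract, with stubs standing in for btrfs_sync_log() and btrfs_commit_transaction():

        #include <errno.h>
        #include <stdio.h>

        /* stubs; the real calls take a trans handle and a root */
        static int sync_log(void) { return -EAGAIN; } /* pretend a rename forced it */
        static int commit_transaction(void) { return 0; }

        int main(void)
        {
                int ret = sync_log();

                if (ret == 0) {
                        printf("fsync satisfied by the tree log\n");
                } else {
                        printf("log bailed (%d), doing a full commit\n", ret);
                        ret = commit_transaction();
                }
                return ret;
        }
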
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for a regular file, if its inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
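check_parent_dirs_for_sync() walks the d_parent pointers, stamping logged_trans on the way up and bailing out the moment any ancestor saw an unlink after the last committed transaction. A compact model of that walk (field names mirror the kernel's, but the code is illustrative):

        #include <stdint.h>
        #include <stdio.h>

        struct dent {
                struct dent *parent;            /* root points to itself */
                uint64_t     logged_trans;
                uint64_t     last_unlink_trans;
        };

        static int parents_need_full_commit(struct dent *d, uint64_t transid,
                                            uint64_t last_committed)
        {
                for (; d; d = d->parent) {
                        d->logged_trans = transid;      /* new names get logged */
                        if (d->last_unlink_trans > last_committed)
                                return 1;               /* force a full commit */
                        if (d->parent == d)
                                break;
                }
                return 0;
        }

        int main(void)
        {
                struct dent root = { &root, 0, 0 };
                struct dent dir  = { &root, 0, 9 };     /* unlinked in trans 9 */

                printf("full commit needed: %d\n",
                       parents_need_full_commit(&dir, 10, 8));
                return 0;
        }
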
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
 2921 * for a regular file, if its inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
2771/* 2968/*
@@ -2884,3 +3081,94 @@ again:
2884 kfree(log_root_tree); 3081 kfree(log_root_tree);
2885 return 0; 3082 return 0;
2886} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
 3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to put this transid
3104 * into the file. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
 3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
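The recording rules above reduce to two early-outs plus one stamp, and only the cross-directory rename case records anything on the directory. A runnable distillation (simplified: the unconditional last_unlink_trans stamp for regular files is in the kernel code above and omitted here):

        #include <stdint.h>
        #include <stdio.h>

        struct inode_model {
                uint64_t logged_trans;
                uint64_t last_unlink_trans;
        };

        static void record_unlink(struct inode_model *dir,
                                  struct inode_model *victim,
                                  uint64_t transid, int for_rename)
        {
                if (dir->logged_trans == transid)
                        return;         /* dir logged: new names are captured */
                if (victim->logged_trans == transid)
                        return;         /* victim logged: replay handles it */
                if (for_rename)         /* only renames need the big hammer */
                        dir->last_unlink_trans = transid;
        }

        int main(void)
        {
                struct inode_model dir = { 0, 0 }, victim = { 0, 0 };

                record_unlink(&dir, &victim, 7, 1);
                printf("dir.last_unlink_trans = %llu\n",
                       (unsigned long long)dir.last_unlink_trans);
                return 0;
        }
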
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root * root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
 3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..a6d35b0054ca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -124,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
124 return NULL; 125 return NULL;
125} 126}
126 127
128static void requeue_list(struct btrfs_pending_bios *pending_bios,
129 struct bio *head, struct bio *tail)
130{
131
132 struct bio *old_head;
133
134 old_head = pending_bios->head;
135 pending_bios->head = head;
136 if (pending_bios->tail)
137 tail->bi_next = old_head;
138 else
139 pending_bios->tail = tail;
140}
141
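requeue_list() splices a partially processed bio chain back onto the front of a head/tail singly linked list, so the bios queued behind it keep their order. The same operation on a generic list, runnable standalone (the empty-list case is written a little more explicitly than the kernel version):

        #include <stdio.h>
        #include <stddef.h>

        struct node { struct node *next; int id; };
        struct list { struct node *head, *tail; };

        /* put the chain head..tail back on the front of the list */
        static void requeue(struct list *l, struct node *head, struct node *tail)
        {
                tail->next = l->head;   /* old contents go behind the chain */
                l->head = head;
                if (!l->tail)           /* list was empty: chain supplies the tail */
                        l->tail = tail;
        }

        int main(void)
        {
                struct node a = { NULL, 1 }, b = { NULL, 2 };
                struct list l = { NULL, NULL };

                requeue(&l, &a, &a);
                requeue(&l, &b, &b);    /* b runs before a again */
                for (struct node *n = l.head; n; n = n->next)
                        printf("bio %d\n", n->id);
                return 0;
        }
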
127/* 142/*
128 * we try to collect pending bios for a device so we don't get a large 143 * we try to collect pending bios for a device so we don't get a large
129 * number of procs sending bios down to the same device. This greatly 144 * number of procs sending bios down to the same device. This greatly
@@ -140,31 +155,44 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
140 struct bio *pending; 155 struct bio *pending;
141 struct backing_dev_info *bdi; 156 struct backing_dev_info *bdi;
142 struct btrfs_fs_info *fs_info; 157 struct btrfs_fs_info *fs_info;
158 struct btrfs_pending_bios *pending_bios;
143 struct bio *tail; 159 struct bio *tail;
144 struct bio *cur; 160 struct bio *cur;
145 int again = 0; 161 int again = 0;
146 unsigned long num_run = 0; 162 unsigned long num_run;
163 unsigned long num_sync_run;
147 unsigned long limit; 164 unsigned long limit;
165 unsigned long last_waited = 0;
148 166
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 167 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 168 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 169 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 170 limit = limit * 2 / 3;
153 171
172 /* we want to make sure that every time we switch from the sync
173 * list to the normal list, we unplug
174 */
175 num_sync_run = 0;
176
154loop: 177loop:
155 spin_lock(&device->io_lock); 178 spin_lock(&device->io_lock);
179 num_run = 0;
156 180
157loop_lock: 181loop_lock:
182
158 /* take all the bios off the list at once and process them 183 /* take all the bios off the list at once and process them
159 * later on (without the lock held). But, remember the 184 * later on (without the lock held). But, remember the
160 * tail and other pointers so the bios can be properly reinserted 185 * tail and other pointers so the bios can be properly reinserted
161 * into the list if we hit congestion 186 * into the list if we hit congestion
162 */ 187 */
163 pending = device->pending_bios; 188 if (device->pending_sync_bios.head)
164 tail = device->pending_bio_tail; 189 pending_bios = &device->pending_sync_bios;
190 else
191 pending_bios = &device->pending_bios;
192
193 pending = pending_bios->head;
194 tail = pending_bios->tail;
165 WARN_ON(pending && !tail); 195 WARN_ON(pending && !tail);
166 device->pending_bios = NULL;
167 device->pending_bio_tail = NULL;
168 196
169 /* 197 /*
170 * if pending was null this time around, no bios need processing 198 * if pending was null this time around, no bios need processing
@@ -174,16 +202,41 @@ loop_lock:
174 * device->running_pending is used to synchronize with the 202 * device->running_pending is used to synchronize with the
175 * schedule_bio code. 203 * schedule_bio code.
176 */ 204 */
177 if (pending) { 205 if (device->pending_sync_bios.head == NULL &&
178 again = 1; 206 device->pending_bios.head == NULL) {
179 device->running_pending = 1;
180 } else {
181 again = 0; 207 again = 0;
182 device->running_pending = 0; 208 device->running_pending = 0;
209 } else {
210 again = 1;
211 device->running_pending = 1;
183 } 212 }
213
214 pending_bios->head = NULL;
215 pending_bios->tail = NULL;
216
184 spin_unlock(&device->io_lock); 217 spin_unlock(&device->io_lock);
185 218
219 /*
220 * if we're doing the regular priority list, make sure we unplug
221 * for any high prio bios we've sent down
222 */
223 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
224 num_sync_run = 0;
225 blk_run_backing_dev(bdi, NULL);
226 }
227
186 while (pending) { 228 while (pending) {
229
230 rmb();
231 if (pending_bios != &device->pending_sync_bios &&
232 device->pending_sync_bios.head &&
233 num_run > 16) {
234 cond_resched();
235 spin_lock(&device->io_lock);
236 requeue_list(pending_bios, pending, tail);
237 goto loop_lock;
238 }
239
187 cur = pending; 240 cur = pending;
188 pending = pending->bi_next; 241 pending = pending->bi_next;
189 cur->bi_next = NULL; 242 cur->bi_next = NULL;
@@ -194,10 +247,18 @@ loop_lock:
194 wake_up(&fs_info->async_submit_wait); 247 wake_up(&fs_info->async_submit_wait);
195 248
196 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 249 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
197 bio_get(cur);
198 submit_bio(cur->bi_rw, cur); 250 submit_bio(cur->bi_rw, cur);
199 bio_put(cur);
200 num_run++; 251 num_run++;
252 if (bio_sync(cur))
253 num_sync_run++;
254
255 if (need_resched()) {
256 if (num_sync_run) {
257 blk_run_backing_dev(bdi, NULL);
258 num_sync_run = 0;
259 }
260 cond_resched();
261 }
201 262
202 /* 263 /*
203 * we made progress, there is more work to do and the bdi 264 * we made progress, there is more work to do and the bdi
@@ -206,17 +267,41 @@ loop_lock:
206 */ 267 */
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 268 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 269 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 270 struct io_context *ioc;
210 271
211 spin_lock(&device->io_lock); 272 ioc = current->io_context;
212
213 old_head = device->pending_bios;
214 device->pending_bios = pending;
215 if (device->pending_bio_tail)
216 tail->bi_next = old_head;
217 else
218 device->pending_bio_tail = tail;
219 273
274 /*
275 * the main goal here is that we don't want to
276 * block if we're going to be able to submit
277 * more requests without blocking.
278 *
279 * This code does two great things, it pokes into
280 * the elevator code from a filesystem _and_
281 * it makes assumptions about how batching works.
282 */
283 if (ioc && ioc->nr_batch_requests > 0 &&
284 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
285 (last_waited == 0 ||
286 ioc->last_waited == last_waited)) {
287 /*
288 * we want to go through our batch of
289 * requests and stop. So, we copy out
290 * the ioc->last_waited time and test
291 * against it before looping
292 */
293 last_waited = ioc->last_waited;
294 if (need_resched()) {
295 if (num_sync_run) {
296 blk_run_backing_dev(bdi, NULL);
297 num_sync_run = 0;
298 }
299 cond_resched();
300 }
301 continue;
302 }
303 spin_lock(&device->io_lock);
304 requeue_list(pending_bios, pending, tail);
220 device->running_pending = 1; 305 device->running_pending = 1;
221 306
222 spin_unlock(&device->io_lock); 307 spin_unlock(&device->io_lock);
@@ -224,13 +309,32 @@ loop_lock:
224 goto done; 309 goto done;
225 } 310 }
226 } 311 }
312
313 if (num_sync_run) {
314 num_sync_run = 0;
315 blk_run_backing_dev(bdi, NULL);
316 }
317
318 cond_resched();
227 if (again) 319 if (again)
228 goto loop; 320 goto loop;
229 321
230 spin_lock(&device->io_lock); 322 spin_lock(&device->io_lock);
231 if (device->pending_bios) 323 if (device->pending_bios.head || device->pending_sync_bios.head)
232 goto loop_lock; 324 goto loop_lock;
233 spin_unlock(&device->io_lock); 325 spin_unlock(&device->io_lock);
326
327 /*
328 * IO has already been through a long path to get here. Checksumming,
329 * async helper threads, perhaps compression. We've done a pretty
330 * good job of collecting a batch of IO and should just unplug
331 * the device right away.
332 *
 333 * This will help anyone who is waiting on the IO; they might have
334 * already unplugged, but managed to do so before the bio they
335 * cared about found its way down here.
336 */
337 blk_run_backing_dev(bdi, NULL);
234done: 338done:
235 return 0; 339 return 0;
236} 340}
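
The scheduling policy in run_scheduled_bios() boils down to: drain pending_sync_bios first, and while working the regular queue, requeue it and switch back whenever sync bios show up. A toy of just that policy, ignoring the num_run > 16 hysteresis, the unplugging, and the ioc batching heuristics (illustrative; in the kernel another thread keeps refilling the sync queue):

        #include <stdio.h>

        struct queue { int items; };

        static void drain(struct queue *sync_q, struct queue *regular_q)
        {
                while (sync_q->items || regular_q->items) {
                        struct queue *q = sync_q->items ? sync_q : regular_q;

                        while (q->items) {
                                q->items--;     /* submit one bio */
                                printf("%s bio\n", q == sync_q ? "sync" : "regular");
                                if (q == regular_q && sync_q->items)
                                        break;  /* requeue, go service sync */
                        }
                }
        }

        int main(void)
        {
                struct queue s = { 1 }, r = { 3 };

                drain(&s, &r);
                return 0;
        }
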
@@ -1336,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1336 device->io_align = root->sectorsize; 1440 device->io_align = root->sectorsize;
1337 device->sector_size = root->sectorsize; 1441 device->sector_size = root->sectorsize;
1338 device->total_bytes = i_size_read(bdev->bd_inode); 1442 device->total_bytes = i_size_read(bdev->bd_inode);
1443 device->disk_total_bytes = device->total_bytes;
1339 device->dev_root = root->fs_info->dev_root; 1444 device->dev_root = root->fs_info->dev_root;
1340 device->bdev = bdev; 1445 device->bdev = bdev;
1341 device->in_fs_metadata = 1; 1446 device->in_fs_metadata = 1;
@@ -1439,7 +1544,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1439 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1544 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1440 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1545 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1441 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1546 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1442 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1547 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1443 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1548 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1444 btrfs_mark_buffer_dirty(leaf); 1549 btrfs_mark_buffer_dirty(leaf);
1445 1550
@@ -1836,14 +1941,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1836 device->total_bytes = new_size; 1941 device->total_bytes = new_size;
1837 if (device->writeable) 1942 if (device->writeable)
1838 device->fs_devices->total_rw_bytes -= diff; 1943 device->fs_devices->total_rw_bytes -= diff;
1839 ret = btrfs_update_device(trans, device);
1840 if (ret) {
1841 unlock_chunks(root);
1842 btrfs_end_transaction(trans, root);
1843 goto done;
1844 }
1845 WARN_ON(diff > old_total);
1846 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1847 unlock_chunks(root); 1944 unlock_chunks(root);
1848 btrfs_end_transaction(trans, root); 1945 btrfs_end_transaction(trans, root);
1849 1946
@@ -1875,7 +1972,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1875 length = btrfs_dev_extent_length(l, dev_extent); 1972 length = btrfs_dev_extent_length(l, dev_extent);
1876 1973
1877 if (key.offset + length <= new_size) 1974 if (key.offset + length <= new_size)
1878 goto done; 1975 break;
1879 1976
1880 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 1977 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1881 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 1978 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1888,6 +1985,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1888 goto done; 1985 goto done;
1889 } 1986 }
1890 1987
1988 /* Shrinking succeeded, else we would be at "done". */
1989 trans = btrfs_start_transaction(root, 1);
1990 if (!trans) {
1991 ret = -ENOMEM;
1992 goto done;
1993 }
1994 lock_chunks(root);
1995
1996 device->disk_total_bytes = new_size;
1997 /* Now btrfs_update_device() will change the on-disk size. */
1998 ret = btrfs_update_device(trans, device);
1999 if (ret) {
2000 unlock_chunks(root);
2001 btrfs_end_transaction(trans, root);
2002 goto done;
2003 }
2004 WARN_ON(diff > old_total);
2005 btrfs_set_super_total_bytes(super_copy, old_total - diff);
2006 unlock_chunks(root);
2007 btrfs_end_transaction(trans, root);
1891done: 2008done:
1892 btrfs_free_path(path); 2009 btrfs_free_path(path);
1893 return ret; 2010 return ret;
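
The reshuffle above makes the on-disk size change contingent on the relocation succeeding: the in-memory total_bytes shrinks first, capping the allocator, and disk_total_bytes is only written once every chunk beyond new_size has been moved, so a crash mid-shrink never records a size the data has not yet been squeezed into. The ordering, reduced to a runnable sketch:

        #include <stdint.h>
        #include <stdio.h>

        static uint64_t mem_total;   /* device->total_bytes: allocator limit */
        static uint64_t disk_total;  /* device->disk_total_bytes: on-disk item */

        static int relocate_chunks_above(uint64_t new_size)
        {
                (void)new_size;      /* pretend the relocation worked */
                return 0;
        }

        static int shrink(uint64_t new_size)
        {
                mem_total = new_size;            /* stop allocations up high */
                if (relocate_chunks_above(new_size))
                        return -1;               /* disk size untouched */
                disk_total = new_size;           /* publish the new size last */
                return 0;
        }

        int main(void)
        {
                int ret;

                mem_total = disk_total = 100;
                ret = shrink(40);
                printf("shrink: %d, disk_total now %llu\n", ret,
                       (unsigned long long)disk_total);
                return 0;
        }
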
@@ -2458,7 +2575,7 @@ again:
2458 max_errors = 1; 2575 max_errors = 1;
2459 } 2576 }
2460 } 2577 }
2461 if (multi_ret && rw == WRITE && 2578 if (multi_ret && (rw & (1 << BIO_RW)) &&
2462 stripes_allocated < stripes_required) { 2579 stripes_allocated < stripes_required) {
2463 stripes_allocated = map->num_stripes; 2580 stripes_allocated = map->num_stripes;
2464 free_extent_map(em); 2581 free_extent_map(em);
@@ -2723,6 +2840,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2723 int rw, struct bio *bio) 2840 int rw, struct bio *bio)
2724{ 2841{
2725 int should_queue = 1; 2842 int should_queue = 1;
2843 struct btrfs_pending_bios *pending_bios;
2726 2844
2727 /* don't bother with additional async steps for reads, right now */ 2845 /* don't bother with additional async steps for reads, right now */
2728 if (!(rw & (1 << BIO_RW))) { 2846 if (!(rw & (1 << BIO_RW))) {
@@ -2744,13 +2862,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
2744 bio->bi_rw |= rw; 2862 bio->bi_rw |= rw;
2745 2863
2746 spin_lock(&device->io_lock); 2864 spin_lock(&device->io_lock);
2865 if (bio_sync(bio))
2866 pending_bios = &device->pending_sync_bios;
2867 else
2868 pending_bios = &device->pending_bios;
2747 2869
2748 if (device->pending_bio_tail) 2870 if (pending_bios->tail)
2749 device->pending_bio_tail->bi_next = bio; 2871 pending_bios->tail->bi_next = bio;
2750 2872
2751 device->pending_bio_tail = bio; 2873 pending_bios->tail = bio;
2752 if (!device->pending_bios) 2874 if (!pending_bios->head)
2753 device->pending_bios = bio; 2875 pending_bios->head = bio;
2754 if (device->running_pending) 2876 if (device->running_pending)
2755 should_queue = 0; 2877 should_queue = 0;
2756 2878
@@ -2967,7 +3089,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
2967 unsigned long ptr; 3089 unsigned long ptr;
2968 3090
2969 device->devid = btrfs_device_id(leaf, dev_item); 3091 device->devid = btrfs_device_id(leaf, dev_item);
2970 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3092 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3093 device->total_bytes = device->disk_total_bytes;
2971 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3094 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2972 device->type = btrfs_device_type(leaf, dev_item); 3095 device->type = btrfs_device_type(leaf, dev_item);
2973 device->io_align = btrfs_device_io_align(leaf, dev_item); 3096 device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..5c3ff6d02fd7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
23#include "async-thread.h" 23#include "async-thread.h"
24 24
25struct buffer_head; 25struct buffer_head;
26struct btrfs_pending_bios {
27 struct bio *head;
28 struct bio *tail;
29};
30
26struct btrfs_device { 31struct btrfs_device {
27 struct list_head dev_list; 32 struct list_head dev_list;
28 struct list_head dev_alloc_list; 33 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices; 34 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root; 35 struct btrfs_root *dev_root;
31 struct bio *pending_bios; 36
32 struct bio *pending_bio_tail; 37 /* regular prio bios */
38 struct btrfs_pending_bios pending_bios;
39 /* WRITE_SYNC bios */
40 struct btrfs_pending_bios pending_sync_bios;
41
33 int running_pending; 42 int running_pending;
34 u64 generation; 43 u64 generation;
35 44
@@ -52,6 +61,9 @@ struct btrfs_device {
52 /* size of the device */ 61 /* size of the device */
53 u64 total_bytes; 62 u64 total_bytes;
54 63
64 /* size of the disk */
65 u64 disk_total_bytes;
66
55 /* bytes used */ 67 /* bytes used */
56 u64 bytes_used; 68 u64 bytes_used;
57 69
@@ -76,7 +88,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 88struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 89 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 90
79 /* the device with this id has the most recent coyp of the super */ 91 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 92 u64 latest_devid;
81 u64 latest_trans; 93 u64 latest_trans;
82 u64 num_devices; 94 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97cb..49106127a4aa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
199 head = page_buffers(page); 199 head = page_buffers(page);
200 bh = head; 200 bh = head;
201 do { 201 do {
202 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
203 ret = bh; 205 ret = bh;
204 get_bh(bh); 206 get_bh(bh);
205 goto out_unlock; 207 goto out_unlock;
206 } 208 }
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 bh = bh->b_this_page; 209 bh = bh->b_this_page;
210 } while (bh != head); 210 } while (bh != head);
211 211
@@ -290,7 +290,7 @@ static void free_more_memory(void)
290 &zone); 290 &zone);
291 if (zone) 291 if (zone)
292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
293 GFP_NOFS); 293 GFP_NOFS, NULL);
294 } 294 }
295} 295}
296 296
@@ -360,7 +360,7 @@ still_busy:
360 * Completion handler for block_write_full_page() - pages which are unlocked 360 * Completion handler for block_write_full_page() - pages which are unlocked
361 * during I/O, and which have PageWriteback cleared upon I/O completion. 361 * during I/O, and which have PageWriteback cleared upon I/O completion.
362 */ 362 */
363static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 363void end_buffer_async_write(struct buffer_head *bh, int uptodate)
364{ 364{
365 char b[BDEVNAME_SIZE]; 365 char b[BDEVNAME_SIZE];
366 unsigned long flags; 366 unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 438 set_buffer_async_read(bh);
439} 439}
440 440
441void mark_buffer_async_write(struct buffer_head *bh) 441void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler)
442{ 443{
443 bh->b_end_io = end_buffer_async_write; 444 bh->b_end_io = handler;
444 set_buffer_async_write(bh); 445 set_buffer_async_write(bh);
445} 446}
447
448void mark_buffer_async_write(struct buffer_head *bh)
449{
450 mark_buffer_async_write_endio(bh, end_buffer_async_write);
451}
446EXPORT_SYMBOL(mark_buffer_async_write); 452EXPORT_SYMBOL(mark_buffer_async_write);
447 453
448 454
@@ -547,6 +553,46 @@ repeat:
547 return err; 553 return err;
548} 554}
549 555
556void do_thaw_all(struct work_struct *work)
557{
558 struct super_block *sb;
559 char b[BDEVNAME_SIZE];
560
561 spin_lock(&sb_lock);
562restart:
563 list_for_each_entry(sb, &super_blocks, s_list) {
564 sb->s_count++;
565 spin_unlock(&sb_lock);
566 down_read(&sb->s_umount);
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570 up_read(&sb->s_umount);
571 spin_lock(&sb_lock);
572 if (__put_super_and_need_restart(sb))
573 goto restart;
574 }
575 spin_unlock(&sb_lock);
576 kfree(work);
577 printk(KERN_WARNING "Emergency Thaw complete\n");
578}
579
580/**
581 * emergency_thaw_all -- forcibly thaw every frozen filesystem
582 *
583 * Used for emergency unfreeze of all filesystems via SysRq
584 */
585void emergency_thaw_all(void)
586{
587 struct work_struct *work;
588
589 work = kmalloc(sizeof(*work), GFP_ATOMIC);
590 if (work) {
591 INIT_WORK(work, do_thaw_all);
592 schedule_work(work);
593 }
594}
595
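emergency_thaw_all() runs from SysRq context, so all it may do is a GFP_ATOMIC allocation and a schedule_work(); do_thaw_all() then takes the blocking locks and kfree()s the work item when it is done. The same fire-and-forget ownership pattern in userspace, with a thread standing in for the workqueue (illustrative only):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct work {
                void (*fn)(struct work *);
        };

        static void do_thaw_model(struct work *w)
        {
                printf("thawing everything...\n");  /* the slow, blocking part */
                free(w);                            /* the handler owns the item */
        }

        static void *worker(void *arg)              /* stands in for the workqueue */
        {
                struct work *w = arg;

                w->fn(w);
                return NULL;
        }

        int main(void)
        {
                struct work *w = malloc(sizeof(*w)); /* like kmalloc(GFP_ATOMIC) */
                pthread_t t;

                if (!w)
                        return 0;                    /* best effort, as in the kernel */
                w->fn = do_thaw_model;
                pthread_create(&t, NULL, worker, w); /* like schedule_work() */
                pthread_join(t, NULL);
                return 0;
        }
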
550/** 596/**
551 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 597 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
552 * @mapping: the mapping which wants those buffers written 598 * @mapping: the mapping which wants those buffers written
@@ -621,14 +667,7 @@ static void __set_page_dirty(struct page *page,
621 spin_lock_irq(&mapping->tree_lock); 667 spin_lock_irq(&mapping->tree_lock);
622 if (page->mapping) { /* Race with truncate? */ 668 if (page->mapping) { /* Race with truncate? */
623 WARN_ON_ONCE(warn && !PageUptodate(page)); 669 WARN_ON_ONCE(warn && !PageUptodate(page));
624 670 account_page_dirtied(page, mapping);
625 if (mapping_cap_account_dirty(mapping)) {
626 __inc_zone_page_state(page, NR_FILE_DIRTY);
627 __inc_bdi_stat(mapping->backing_dev_info,
628 BDI_RECLAIMABLE);
629 task_dirty_inc(current);
630 task_io_account_write(PAGE_CACHE_SIZE);
631 }
632 radix_tree_tag_set(&mapping->page_tree, 671 radix_tree_tag_set(&mapping->page_tree,
633 page_index(page), PAGECACHE_TAG_DIRTY); 672 page_index(page), PAGECACHE_TAG_DIRTY);
634 } 673 }
@@ -711,7 +750,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
711{ 750{
712 struct buffer_head *bh; 751 struct buffer_head *bh;
713 struct list_head tmp; 752 struct list_head tmp;
714 struct address_space *mapping; 753 struct address_space *mapping, *prev_mapping = NULL;
715 int err = 0, err2; 754 int err = 0, err2;
716 755
717 INIT_LIST_HEAD(&tmp); 756 INIT_LIST_HEAD(&tmp);
@@ -736,7 +775,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
736 * contents - it is a noop if I/O is still in 775 * contents - it is a noop if I/O is still in
737 * flight on potentially older contents. 776 * flight on potentially older contents.
738 */ 777 */
739 ll_rw_block(SWRITE_SYNC, 1, &bh); 778 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
779
780 /*
781 * Kick off IO for the previous mapping. Note
782 * that we will not run the very last mapping,
783 * wait_on_buffer() will do that for us
784 * through sync_buffer().
785 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
740 brelse(bh); 790 brelse(bh);
741 spin_lock(lock); 791 spin_lock(lock);
742 } 792 }
@@ -1559,9 +1609,20 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1559 * locked buffer. This only can happen if someone has written the buffer 1609 * locked buffer. This only can happen if someone has written the buffer
1560 * directly, with submit_bh(). At the address_space level PageWriteback 1610 * directly, with submit_bh(). At the address_space level PageWriteback
1561 * prevents this contention from occurring. 1611 * prevents this contention from occurring.
1612 *
1613 * If block_write_full_page() is called with wbc->sync_mode ==
1614 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1615 * causes the writes to be flagged as synchronous writes, but the
1616 * block device queue will NOT be unplugged, since usually many pages
 1617 * will be pushed out before the higher-level caller actually
1618 * waits for the writes to be completed. The various wait functions,
1619 * such as wait_on_writeback_range() will ultimately call sync_page()
1620 * which will ultimately call blk_run_backing_dev(), which will end up
1621 * unplugging the device queue.
1562 */ 1622 */
1563static int __block_write_full_page(struct inode *inode, struct page *page, 1623static int __block_write_full_page(struct inode *inode, struct page *page,
1564 get_block_t *get_block, struct writeback_control *wbc) 1624 get_block_t *get_block, struct writeback_control *wbc,
1625 bh_end_io_t *handler)
1565{ 1626{
1566 int err; 1627 int err;
1567 sector_t block; 1628 sector_t block;
@@ -1569,6 +1630,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1569 struct buffer_head *bh, *head; 1630 struct buffer_head *bh, *head;
1570 const unsigned blocksize = 1 << inode->i_blkbits; 1631 const unsigned blocksize = 1 << inode->i_blkbits;
1571 int nr_underway = 0; 1632 int nr_underway = 0;
1633 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1634 WRITE_SYNC_PLUG : WRITE);
1572 1635
1573 BUG_ON(!PageLocked(page)); 1636 BUG_ON(!PageLocked(page));
1574 1637
@@ -1644,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1644 continue; 1707 continue;
1645 } 1708 }
1646 if (test_clear_buffer_dirty(bh)) { 1709 if (test_clear_buffer_dirty(bh)) {
1647 mark_buffer_async_write(bh); 1710 mark_buffer_async_write_endio(bh, handler);
1648 } else { 1711 } else {
1649 unlock_buffer(bh); 1712 unlock_buffer(bh);
1650 } 1713 }
@@ -1660,7 +1723,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1660 do { 1723 do {
1661 struct buffer_head *next = bh->b_this_page; 1724 struct buffer_head *next = bh->b_this_page;
1662 if (buffer_async_write(bh)) { 1725 if (buffer_async_write(bh)) {
1663 submit_bh(WRITE, bh); 1726 submit_bh(write_op, bh);
1664 nr_underway++; 1727 nr_underway++;
1665 } 1728 }
1666 bh = next; 1729 bh = next;
@@ -1697,7 +1760,7 @@ recover:
1697 if (buffer_mapped(bh) && buffer_dirty(bh) && 1760 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1698 !buffer_delay(bh)) { 1761 !buffer_delay(bh)) {
1699 lock_buffer(bh); 1762 lock_buffer(bh);
1700 mark_buffer_async_write(bh); 1763 mark_buffer_async_write_endio(bh, handler);
1701 } else { 1764 } else {
1702 /* 1765 /*
1703 * The buffer may have been set dirty during 1766 * The buffer may have been set dirty during
@@ -1714,7 +1777,7 @@ recover:
1714 struct buffer_head *next = bh->b_this_page; 1777 struct buffer_head *next = bh->b_this_page;
1715 if (buffer_async_write(bh)) { 1778 if (buffer_async_write(bh)) {
1716 clear_buffer_dirty(bh); 1779 clear_buffer_dirty(bh);
1717 submit_bh(WRITE, bh); 1780 submit_bh(write_op, bh);
1718 nr_underway++; 1781 nr_underway++;
1719 } 1782 }
1720 bh = next; 1783 bh = next;
@@ -2320,20 +2383,22 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2320 * unlock the page. 2383 * unlock the page.
2321 */ 2384 */
2322int 2385int
2323block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2386block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2324 get_block_t get_block) 2387 get_block_t get_block)
2325{ 2388{
2389 struct page *page = vmf->page;
2326 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2390 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2327 unsigned long end; 2391 unsigned long end;
2328 loff_t size; 2392 loff_t size;
2329 int ret = -EINVAL; 2393 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2330 2394
2331 lock_page(page); 2395 lock_page(page);
2332 size = i_size_read(inode); 2396 size = i_size_read(inode);
2333 if ((page->mapping != inode->i_mapping) || 2397 if ((page->mapping != inode->i_mapping) ||
2334 (page_offset(page) > size)) { 2398 (page_offset(page) > size)) {
2335 /* page got truncated out from underneath us */ 2399 /* page got truncated out from underneath us */
2336 goto out_unlock; 2400 unlock_page(page);
2401 goto out;
2337 } 2402 }
2338 2403
2339 /* page is wholly or partially inside EOF */ 2404 /* page is wholly or partially inside EOF */
@@ -2346,8 +2411,16 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2346 if (!ret) 2411 if (!ret)
2347 ret = block_commit_write(page, 0, end); 2412 ret = block_commit_write(page, 0, end);
2348 2413
2349out_unlock: 2414 if (unlikely(ret)) {
2350 unlock_page(page); 2415 unlock_page(page);
2416 if (ret == -ENOMEM)
2417 ret = VM_FAULT_OOM;
2418 else /* -ENOSPC, -EIO, etc */
2419 ret = VM_FAULT_SIGBUS;
2420 } else
2421 ret = VM_FAULT_LOCKED;
2422
2423out:
2351 return ret; 2424 return ret;
2352} 2425}
2353 2426
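The block_page_mkwrite() change stops leaking raw errnos to the fault path: the function now speaks VM_FAULT_* exclusively, returning VM_FAULT_LOCKED with the page still held locked on success. The mapping as a small runnable function (the FAULT_* values below are stand-ins, not the kernel constants):

        #include <errno.h>

        enum { FAULT_LOCKED = 1, FAULT_NOPAGE = 2, FAULT_OOM = 4, FAULT_SIGBUS = 8 };

        static int mkwrite_result(int err)
        {
                if (err == 0)
                        return FAULT_LOCKED;    /* page stays locked for caller */
                if (err == -ENOMEM)
                        return FAULT_OOM;
                return FAULT_SIGBUS;            /* -ENOSPC, -EIO, ... */
        }

        int main(void)
        {
                return mkwrite_result(-ENOSPC) == FAULT_SIGBUS ? 0 : 1;
        }
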
@@ -2615,7 +2688,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
2615out: 2688out:
2616 ret = mpage_writepage(page, get_block, wbc); 2689 ret = mpage_writepage(page, get_block, wbc);
2617 if (ret == -EAGAIN) 2690 if (ret == -EAGAIN)
2618 ret = __block_write_full_page(inode, page, get_block, wbc); 2691 ret = __block_write_full_page(inode, page, get_block, wbc,
2692 end_buffer_async_write);
2619 return ret; 2693 return ret;
2620} 2694}
2621EXPORT_SYMBOL(nobh_writepage); 2695EXPORT_SYMBOL(nobh_writepage);
@@ -2662,6 +2736,8 @@ has_buffers:
2662 pos += blocksize; 2736 pos += blocksize;
2663 } 2737 }
2664 2738
2739 map_bh.b_size = blocksize;
2740 map_bh.b_state = 0;
2665 err = get_block(inode, iblock, &map_bh, 0); 2741 err = get_block(inode, iblock, &map_bh, 0);
2666 if (err) 2742 if (err)
2667 goto unlock; 2743 goto unlock;
@@ -2773,9 +2849,10 @@ out:
2773 2849
2774/* 2850/*
2775 * The generic ->writepage function for buffer-backed address_spaces 2851 * The generic ->writepage function for buffer-backed address_spaces
2852 * this form passes in the end_io handler used to finish the IO.
2776 */ 2853 */
2777int block_write_full_page(struct page *page, get_block_t *get_block, 2854int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2778 struct writeback_control *wbc) 2855 struct writeback_control *wbc, bh_end_io_t *handler)
2779{ 2856{
2780 struct inode * const inode = page->mapping->host; 2857 struct inode * const inode = page->mapping->host;
2781 loff_t i_size = i_size_read(inode); 2858 loff_t i_size = i_size_read(inode);
@@ -2784,7 +2861,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2784 2861
2785 /* Is the page fully inside i_size? */ 2862 /* Is the page fully inside i_size? */
2786 if (page->index < end_index) 2863 if (page->index < end_index)
2787 return __block_write_full_page(inode, page, get_block, wbc); 2864 return __block_write_full_page(inode, page, get_block, wbc,
2865 handler);
2788 2866
2789 /* Is the page fully outside i_size? (truncate in progress) */ 2867 /* Is the page fully outside i_size? (truncate in progress) */
2790 offset = i_size & (PAGE_CACHE_SIZE-1); 2868 offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2807,9 +2885,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2807 * writes to that region are not written out to the file." 2885 * writes to that region are not written out to the file."
2808 */ 2886 */
2809 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2887 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2810 return __block_write_full_page(inode, page, get_block, wbc); 2888 return __block_write_full_page(inode, page, get_block, wbc, handler);
2889}
2890
2891/*
2892 * The generic ->writepage function for buffer-backed address_spaces
2893 */
2894int block_write_full_page(struct page *page, get_block_t *get_block,
2895 struct writeback_control *wbc)
2896{
2897 return block_write_full_page_endio(page, get_block, wbc,
2898 end_buffer_async_write);
2811} 2899}
2812 2900
2901
2813sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2902sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2814 get_block_t *get_block) 2903 get_block_t *get_block)
2815{ 2904{
@@ -2922,12 +3011,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2922 for (i = 0; i < nr; i++) { 3011 for (i = 0; i < nr; i++) {
2923 struct buffer_head *bh = bhs[i]; 3012 struct buffer_head *bh = bhs[i];
2924 3013
2925 if (rw == SWRITE || rw == SWRITE_SYNC) 3014 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2926 lock_buffer(bh); 3015 lock_buffer(bh);
2927 else if (!trylock_buffer(bh)) 3016 else if (!trylock_buffer(bh))
2928 continue; 3017 continue;
2929 3018
2930 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 3019 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3020 rw == SWRITE_SYNC_PLUG) {
2931 if (test_clear_buffer_dirty(bh)) { 3021 if (test_clear_buffer_dirty(bh)) {
2932 bh->b_end_io = end_buffer_write_sync; 3022 bh->b_end_io = end_buffer_write_sync;
2933 get_bh(bh); 3023 get_bh(bh);
@@ -2963,7 +3053,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2963 if (test_clear_buffer_dirty(bh)) { 3053 if (test_clear_buffer_dirty(bh)) {
2964 get_bh(bh); 3054 get_bh(bh);
2965 bh->b_end_io = end_buffer_write_sync; 3055 bh->b_end_io = end_buffer_write_sync;
2966 ret = submit_bh(WRITE, bh); 3056 ret = submit_bh(WRITE_SYNC, bh);
2967 wait_on_buffer(bh); 3057 wait_on_buffer(bh);
2968 if (buffer_eopnotsupp(bh)) { 3058 if (buffer_eopnotsupp(bh)) {
2969 clear_buffer_eopnotsupp(bh); 3059 clear_buffer_eopnotsupp(bh);
@@ -3277,11 +3367,12 @@ EXPORT_SYMBOL(block_read_full_page);
3277EXPORT_SYMBOL(block_sync_page); 3367EXPORT_SYMBOL(block_sync_page);
3278EXPORT_SYMBOL(block_truncate_page); 3368EXPORT_SYMBOL(block_truncate_page);
3279EXPORT_SYMBOL(block_write_full_page); 3369EXPORT_SYMBOL(block_write_full_page);
3370EXPORT_SYMBOL(block_write_full_page_endio);
3280EXPORT_SYMBOL(cont_write_begin); 3371EXPORT_SYMBOL(cont_write_begin);
3281EXPORT_SYMBOL(end_buffer_read_sync); 3372EXPORT_SYMBOL(end_buffer_read_sync);
3282EXPORT_SYMBOL(end_buffer_write_sync); 3373EXPORT_SYMBOL(end_buffer_write_sync);
3374EXPORT_SYMBOL(end_buffer_async_write);
3283EXPORT_SYMBOL(file_fsync); 3375EXPORT_SYMBOL(file_fsync);
3284EXPORT_SYMBOL(fsync_bdev);
3285EXPORT_SYMBOL(generic_block_bmap); 3376EXPORT_SYMBOL(generic_block_bmap);
3286EXPORT_SYMBOL(generic_cont_expand_simple); 3377EXPORT_SYMBOL(generic_cont_expand_simple);
3287EXPORT_SYMBOL(init_buffer); 3378EXPORT_SYMBOL(init_buffer);
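With block_page_mkwrite() now taking the struct vm_fault and returning VM_FAULT_* codes itself (see the hunks above), a filesystem's ->page_mkwrite handler can simply forward the result. A minimal hypothetical caller, for illustration only (myfs_page_mkwrite and myfs_get_block are invented names, not part of this patch):

	static int myfs_page_mkwrite(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
	{
		/* locks the page, maps and commits the buffers, and maps
		 * -ENOMEM/-ENOSPC/-EIO onto VM_FAULT_OOM/VM_FAULT_SIGBUS
		 * as shown above; VM_FAULT_LOCKED on success */
		return block_page_mkwrite(vma, vmf, myfs_get_block);
	}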
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7	  filesystems - primarily networking filesystems - thus allowing a
8	  fast local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
19	  enabled by setting bits in /sys/module/cachefiles/parameters/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
27 This option causes latency information to be gathered on CacheFiles
28	  operations and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146	 * the security ID of the files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
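The percent-to-absolute conversion in cachefiles_daemon_add_cache() is easiest to check with concrete numbers. A worked example under an assumed filesystem size (the default percentages shown are the ones set up in daemon.c below):

	/*
	 * Backing fs with f_files = 1,000,000 inodes and the default
	 * limits fstop=1%, fcull=5%, frun=7%:
	 *
	 *	do_div(stats.f_files, 100)	-> 10,000  (one percent)
	 *	fstop = 10,000 * 1 = 10,000	-> below 10,000 free files,
	 *					   refuse new objects (-ENOBUFS)
	 *	fcull = 10,000 * 5 = 50,000	-> below 50,000 free, cull
	 *	frun  = 10,000 * 7 = 70,000	-> above 70,000 free, stop
	 *					   culling
	 *
	 * The block limits are derived the same way from f_blocks, after
	 * shifting by bshift to convert filesystem blocks into pages.
	 */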
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223	if (datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388	if (fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460	if (bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
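Taken together, the open/read/write/poll handlers above define a small text protocol on the control file. A sketch of the userspace side in the spirit of cachefilesd, under the assumption that the character device is exposed as /dev/cachefiles (its registration lives in main.c, outside this hunk); error handling is minimal:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int cf_command(int fd, const char *cmd)
	{
		/* one command per write(); success echoes back the length */
		if (write(fd, cmd, strlen(cmd)) < 0) {
			perror(cmd);
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		struct pollfd p;
		char state[256];
		ssize_t n;
		int fd;

		fd = open("/dev/cachefiles", O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* configure, honouring stop < cull < run, then bind;
		 * releasing the fd later unbinds the cache */
		cf_command(fd, "dir /var/cache/fscache");
		cf_command(fd, "tag mycache");
		cf_command(fd, "brun 10%");
		cf_command(fd, "bcull 7%");
		cf_command(fd, "bstop 3%");
		cf_command(fd, "bind");

		p.fd = fd;
		p.events = POLLIN | POLLOUT;
		for (;;) {
			if (poll(&p, 1, -1) < 0)
				break;
			if (p.revents & POLLIN) {
				/* state changed: re-read the summary line */
				n = read(fd, state, sizeof(state) - 1);
				if (n > 0) {
					state[n] = '\0';
					printf("state: %s\n", state);
				}
			}
			if (p.revents & POLLOUT) {
				/* kernel wants space: chdir() into the cache
				 * and issue "cull <name>" / "inuse <name>" */
			}
		}
		return 0;
	}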
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
78	/* turn the raw key into something that can be used as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
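/* For reference: before being handed to cachefiles_cook_key() and then
 * reused as the auxiliary-data xattr, the temporary buffer built above
 * is laid out as:
 *
 *	bytes 0-1	uint16_t key length (asserted < 512)
 *	bytes 2 on	raw key from cookie->def->get_key()
 *	+3 zero bytes	padding so the 3-to-4 base64 grouping in key.c can
 *			safely read past the end of a short tail
 */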
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
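/* Rough object lifecycle as wired up by this table (summarised from the
 * functions above): alloc_object builds the record and cooks the key;
 * lookup_object walks to or creates the backing file; lookup_complete
 * discards the lookup data; grab_object takes an extra usage reference;
 * drop_object retires or deactivates the backing dentry; put_object drops
 * a reference and frees the record when the last one goes. */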
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..f7c255f9c624
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
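/* the bits OR together: e.g. a mask of 7, set with the daemon's
 * "debug 7" command (see daemon.c above) or via the module's debug
 * parameter, enables all three classes of message */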
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
67	unsigned			bshift;		/* ilog2(PAGE_SIZE / bsize); 0 if bsize >= PAGE_SIZE */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
192/*
193 * rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
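
The three tiers above trade cost for flexibility: with __KDEBUG every call traces unconditionally, with CONFIG_CACHEFILES_DEBUG each call is gated at runtime on bits in cachefiles_debug (exposed as the module's "debug" parameter, see main.c below), and by default the calls compile away entirely while _dbprintk keeps the format strings type-checked. A minimal userspace sketch of the mask-gated tier, using illustrative bit values rather than the kernel's CACHEFILES_DEBUG_* constants:

/* Userspace sketch of mask-gated function tracing; the bit values are
 * illustrative only. Build with: cc -o trace trace.c */
#include <stdio.h>

#define DBG_KENTER 0x1
#define DBG_KLEAVE 0x2

static unsigned debug_mask = DBG_KENTER | DBG_KLEAVE;

#define _enter(FMT, ...) \
do { \
	if (debug_mask & DBG_KENTER) \
		printf("==> %s(" FMT ")\n", __func__, ##__VA_ARGS__); \
} while (0)

#define _leave(FMT, ...) \
do { \
	if (debug_mask & DBG_KLEAVE) \
		printf("<== %s()" FMT "\n", __func__, ##__VA_ARGS__); \
} while (0)

static int double_it(int x)
{
	_enter("%d", x);
	_leave(" = %d", x * 2);
	return x * 2;
}

int main(void)
{
	double_it(21);
	debug_mask = 0;		/* silence tracing without recompiling */
	double_it(21);
	return 0;
}

Clearing a bit in the mask silences the corresponding tracepoints at runtime, which is exactly what the CONFIG_CACHEFILES_DEBUG variant buys over the always-on __KDEBUG build.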
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102		for (loop = keylen - 2; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
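
For keys that fail the printable-character test, the encoder packs each group of three raw bytes little-endian into an accumulator and emits four 6-bit indices into cachefiles_charmap. A standalone sketch of just that step (the checksum prefix, segment dividers and type suffix handled above are omitted):

/* Userspace sketch of the 3-raw-bytes -> 4-cooked-chars mapping used
 * for non-printable keys; same charmap, same bit order as above. */
#include <stdio.h>

static const char charmap[64] =
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"_-";

int main(void)
{
	const unsigned char raw[3] = { 0xde, 0xad, 0xbe };
	unsigned int acc = raw[0] | raw[1] << 8 | raw[2] << 16;
	char out[5];
	int i;

	for (i = 0; i < 4; i++) {	/* low six bits first */
		out[i] = charmap[acc & 63];
		acc >>= 6;
	}
	out[4] = '\0';
	printf("%02x%02x%02x -> %s\n", raw[0], raw[1], raw[2], out);
	return 0;
}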
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
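
Registering a dynamic miscdevice named "cachefiles" makes the control interface appear as /dev/cachefiles; the command protocol spoken over it is implemented by cachefiles_daemon_fops in daemon.c, which is not part of this excerpt. A minimal, hypothetical userspace probe (the device path is implied by the miscdevice name; any actual daemon commands are defined elsewhere in this series):

/* Hypothetical sketch: verify the CacheFiles control device can be
 * opened; real daemon traffic goes through cachefiles_daemon_fops. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/cachefiles", O_RDWR);

	if (fd < 0) {
		perror("open /dev/cachefiles");
		return 1;
	}
	printf("CacheFiles control device is present\n");
	close(fd);
	return 0;
}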
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file backed objects are unlinked
122 * - directory backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}
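
The grave name is sixteen hex digits - wall-clock seconds plus a global counter - and a collision with an existing grave just provokes another lap of the try_again loop. A userspace analogue of that retry discipline, assuming glibc 2.28+ for renameat2(); the lock_rename()/trap deadlock-avoidance checks have no userspace counterpart and are left out:

/* Userspace analogue of the graveyard move: generate a
 * "<seconds><counter>" grave name, refuse to clobber an existing
 * grave, and retry on collision like the kernel loop above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <time.h>
#include <errno.h>
#include <stdint.h>

static int bury(const char *victim, const char *graveyard)
{
	static uint32_t counter;
	char grave[256];

	for (;;) {
		snprintf(grave, sizeof(grave), "%s/%08x%08x", graveyard,
			 (uint32_t) time(NULL), ++counter);
		if (renameat2(AT_FDCWD, victim, AT_FDCWD, grave,
			      RENAME_NOREPLACE) == 0)
			return 0;
		if (errno != EEXIST)
			return -1;
	}
}

int main(void)
{
	return bury("stale-object", "graveyard") ? 1 : 0;
}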
239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290		_leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
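
The advance loop above consumes the cooked key as a list of NUL-separated path components terminated by a double NUL (key.c appends the two trailing NULs). A sketch of walking a key in that format; the component strings here are invented for illustration:

/* Sketch of the key-walking convention: NUL-separated components,
 * list terminated by a double NUL (the string literal's implicit
 * terminator supplies the second one). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char key[] = "@5c\0Iparent\0Dleaf\0";
	const char *p;

	for (p = key; *p; p += strlen(p) + 1)
		printf("component: '%s'\n", p);
	return 0;
}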
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
601
602/*
603 * find out if an object is in use or not
604 * - if the object is found and is not in use:
605 * - returns the victim dentry with a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
753 * - returns -EBUSY or 0 to indicate whether an object is in use or not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
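
Both cachefiles_mark_object_active() and cachefiles_check_active() treat cache->active_nodes as an ordered set keyed by nothing more than the numeric value of the dentry pointer. A userspace sketch of the same idea, with tsearch(3) standing in for the kernel rbtree; only a consistent total order matters, which is why comparing raw pointers suffices:

/* Userspace sketch of an "active set" ordered by raw pointer value,
 * mirroring the dentry-pointer comparisons in the rbtree walks above. */
#include <stdio.h>
#include <search.h>

static int ptr_cmp(const void *a, const void *b)
{
	return (a > b) - (a < b);
}

int main(void)
{
	void *root = NULL;
	int obj_a, obj_b;	/* stand-ins for two dentries */

	tsearch(&obj_a, &root, ptr_cmp);
	tsearch(&obj_b, &root, ptr_cmp);

	printf("obj_a active? %s\n", tfind(&obj_a, &root, ptr_cmp) ? "yes" : "no");
	tdelete(&obj_a, &root, ptr_cmp);
	printf("obj_a active? %s\n", tfind(&obj_a, &root, ptr_cmp) ? "yes" : "no");
	return 0;
}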
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56 if ((unsigned long long)*_pos >= HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
88 * open "/proc/fs/cachefiles/XXX", which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
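
The iterator maps seq positions 1 and 2 to the two header lines and position n+3 to jiffy bucket n, so the proc file reads as one row per latency bucket that recorded any events. Dumping it is just a read of the file created by cachefiles_proc_init():

/* Sketch: dump /proc/fs/cachefiles/histogram, whose rows come from
 * cachefiles_histogram_show() (bucket index = seq position - 3). */
#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/fs/cachefiles/histogram", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}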
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
134 * read the backing file page corresponding to the given netfs page
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if we ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343 * - TODO: don't use bmap() for this, as it is _not_ actually good
344 * enough: it doesn't indicate errors, but it's all we've
345 * got for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
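
The bmap probe depends on the shift computed above: a page index becomes a backing-filesystem block number by shifting left by PAGE_SHIFT minus the superblock's block-size bits. A worked example under assumed sizes - 4096-byte pages over 1024-byte blocks gives shift = 2, so page 5 starts at block 20:

/* Worked example of the page-index -> block-number shift, assuming
 * 4096-byte pages (PAGE_SHIFT = 12) and 1024-byte blocks (10 bits). */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;
	unsigned int blocksize_bits = 10;
	unsigned int shift = page_shift - blocksize_bits;
	unsigned long long index = 5;	/* page->index */

	printf("page %llu -> first block %llu\n", index, index << shift);
	return 0;
}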
372
373/*
374 * read the corresponding pages to the given set from the backing file
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654 * - TODO: don't use bmap() for this, as it is _not_ actually
655 * good enough: it doesn't indicate errors, but
656 * it's all we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
706 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
746 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
51
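The cred struct built here is not used directly; it is swapped in around each cache operation via the kernel's credential-override API. cachefiles_begin_secure()/cachefiles_end_secure(), seen below, are presumably thin wrappers over this pattern (a sketch; the helper name is illustrative):

    #include <linux/cred.h>

    /* sketch: run a VFS operation under the cache's identity */
    static void do_as_cache(struct cachefiles_cache *cache)
    {
            const struct cred *saved;

            saved = override_creds(cache->cache_cred);  /* swap in */
            /* ... vfs_mkdir()/vfs_create()/etc. on the backing fs ... */
            revert_creds(saved);                        /* swap back */
    }
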
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63 " Security denies permission to make dirs: error %d",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71 " Security denies permission to create files: error %d",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
99 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
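Because the label is an ordinary user.* xattr on the backing file, it can be examined from userspace when debugging a cache. A hypothetical inspection tool using getxattr(2) (the cache path is supplied by the user; older C libraries export the call from <attr/xattr.h> instead):

    #include <stdio.h>
    #include <sys/xattr.h>

    /* hex-dump the CacheFiles xattr of a backing-store object */
    int main(int argc, char **argv)
    {
            unsigned char buf[515];
            ssize_t i, n;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <cache object>\n", argv[0]);
                    return 1;
            }

            n = getxattr(argv[1], "user.CacheFiles.cache",
                         buf, sizeof(buf));
            if (n < 0) {
                    perror("getxattr");
                    return 1;
            }

            /* the leading bytes carry the type label, e.g. "C3" */
            for (i = 0; i < n; i++)
                    printf("%02x", buf[i]);
            putchar('\n');
            return 0;
    }
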
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
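
The fscache_check_aux() consultation above is where a netfs gets to veto a cache object. On the netfs side that means supplying a check_aux callback in its cookie definition; a hypothetical one comparing a stored version counter might look like this (struct and field names beyond the fscache API are invented for illustration):

    #include <linux/fscache.h>

    struct myfs_aux {
            __u64 data_version;     /* hypothetical coherency datum */
    };

    /* decide whether the on-disk copy is still usable */
    static enum fscache_checkaux
    myfs_check_aux(void *cookie_netfs_data, const void *data,
                   uint16_t datalen)
    {
            const struct myfs_aux *disk = data;
            const struct myfs_aux *live = cookie_netfs_data;

            if (datalen != sizeof(*disk))
                    return FSCACHE_CHECKAUX_OBSOLETE;   /* cull it */

            if (disk->data_version != live->data_version)
                    return FSCACHE_CHECKAUX_OBSOLETE;   /* stale */

            return FSCACHE_CHECKAUX_OKAY;
    }

Returning FSCACHE_CHECKAUX_NEEDS_UPDATE instead would take the XATTR_REPLACE branch above, refreshing the on-disk label rather than discarding the object.
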
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 65984006192c..f20c4069c220 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,16 @@
1Version 1.58
2------------
3Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
4when the UTF-8 string is composed of unusually long (more than 4 byte) converted
5characters. Add support for mounting the root of a share which redirects
6immediately to a DFS target. Convert the Unicode string conversion functions to
7measure string length more accurately before allocating memory (which may help the
8rare cases where a UTF-8 string is much larger than the UCS2 string that
9we converted from). Fix endianness of the vcnum field used during
10session setup to distinguish multiple mounts to the same server from different
11userids. Raw NTLMSSP fixed (it requires the /proc/fs/cifs/experimental
12flag to be set to 2, and mount must enable krb5 to turn on extended security).
13
1Version 1.57 14Version 1.57
2------------ 15------------
3Improve support for multiple security contexts to the same server. We 16Improve support for multiple security contexts to the same server. We
@@ -15,7 +28,8 @@ Posix file open support added (turned off after one attempt if server
15fails to support it properly, as with Samba server versions prior to 3.3.2) 28fails to support it properly, as with Samba server versions prior to 3.3.2)
16Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too 29Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
17little memory for the "nativeFileSystem" field returned by the server 30little memory for the "nativeFileSystem" field returned by the server
18during mount). 31during mount). Endian convert inode numbers if necessary (makes it easier
32to compare inode numbers on network files from big endian systems).
19 33
20Version 1.56 34Version 1.56
21------------ 35------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 07434181623b..db208ddb9899 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -651,7 +651,15 @@ Experimental When set to 1 used to enable certain experimental
651 signing turned on in case buffer was modified 651 signing turned on in case buffer was modified
652 just before it was sent, also this flag will 652 just before it was sent, also this flag will
653 be used to use the new experimental directory change 653 be used to use the new experimental directory change
654 notification code). 654 notification code). When set to 2, it enables
655 an additional experimental feature, "raw ntlmssp"
656 session establishment support (which allows
657 specifying "sec=ntlmssp" on mount). The Linux cifs
658 module will use ntlmv2 authentication encapsulated
659 in "raw ntlmssp" (not using SPNEGO) when
660 "sec=ntlmssp" is specified on mount.
661 This support also requires building cifs with
662 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
655 663
656These experimental features and tracing can be enabled by changing flags in 664These experimental features and tracing can be enabled by changing flags in
657/proc/fs/cifs (after the cifs module has been installed or built into the 665/proc/fs/cifs (after the cifs module has been installed or built into the
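Condensing the README text above into code: raise the experimental flag to 2, then mount with sec=ntlmssp. A hypothetical sketch via mount(2) (server, share, mountpoint and user are placeholders, and a real mount also needs credentials via mount.cifs or a password= option):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            FILE *f = fopen("/proc/fs/cifs/experimental", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("2\n", f);        /* enable the second-tier features */
            fclose(f);

            /* raw-ntlmssp session establishment, per the README */
            if (mount("//server/share", "/mnt/cifs", "cifs", 0,
                      "sec=ntlmssp,user=someuser")) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
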
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 5fdbf8a14472..83d62759c7c7 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -340,28 +340,24 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
341 341
342 for (i = 0; i < num_referrals; i++) { 342 for (i = 0; i < num_referrals; i++) {
343 int len;
343 dump_referral(referrals+i); 344 dump_referral(referrals+i);
344 /* connect to a storage node */ 345 /* connect to a node */
345 if (referrals[i].flags & DFSREF_STORAGE_SERVER) { 346 len = strlen(referrals[i].node_name);
346 int len; 347 if (len < 2) {
347 len = strlen(referrals[i].node_name); 348 cERROR(1, ("%s: Net Address path too short: %s",
348 if (len < 2) {
349 cERROR(1, ("%s: Net Address path too short: %s",
350 __func__, referrals[i].node_name)); 349 __func__, referrals[i].node_name));
351 rc = -EINVAL; 350 rc = -EINVAL;
352 goto out_err; 351 goto out_err;
353 } 352 }
354 mnt = cifs_dfs_do_refmount(nd->path.mnt, 353 mnt = cifs_dfs_do_refmount(nd->path.mnt,
355 nd->path.dentry, 354 nd->path.dentry, referrals + i);
356 referrals + i); 355 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
357 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
358 __func__,
359 referrals[i].node_name, mnt)); 356 referrals[i].node_name, mnt));
360 357
361 /* complete mount procedure if we acquired a submount */ 358
362 if (!IS_ERR(mnt)) 359 if (!IS_ERR(mnt))
363 break; 360 break;
364 }
365 } 361 }
366 362
367 /* we need it because the for() above could exit without a valid submount */ 363
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 3fd3a9df043a..67bf93a40d2e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
41 41
42 /* attach the data */ 42 /* attach the data */
43 memcpy(payload, data, datalen); 43 memcpy(payload, data, datalen);
44 rcu_assign_pointer(key->payload.data, payload); 44 key->payload.data = payload;
45 ret = 0; 45 ret = 0;
46 46
47error: 47error:
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7d75272a6b3f..60e3c4253de0 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifs_unicode.c 2 * fs/cifs/cifs_unicode.c
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2000,2005 4 * Copyright (c) International Business Machines Corp., 2000,2009
5 * Modified by Steve French (sfrench@us.ibm.com) 5 * Modified by Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -26,31 +26,157 @@
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27 27
28/* 28/*
29 * NAME: cifs_strfromUCS() 29 * cifs_ucs2_bytes - how long will a string be after conversion?
30 * 30 * @ucs - pointer to input string
31 * FUNCTION: Convert little-endian unicode string to character string 31 * @maxbytes - don't go past this many bytes of input string
32 * @codepage - destination codepage
32 * 33 *
34 * Walk a ucs2le string and return the number of bytes that the string will
35 * be after being converted to the given charset, not including any null
36 * termination required. Don't walk past maxbytes in the source buffer.
33 */ 37 */
34int 38int
35cifs_strfromUCS_le(char *to, const __le16 *from, 39cifs_ucs2_bytes(const __le16 *from, int maxbytes,
36 int len, const struct nls_table *codepage) 40 const struct nls_table *codepage)
37{ 41{
38 int i; 42 int i;
39 int outlen = 0; 43 int charlen, outlen = 0;
44 int maxwords = maxbytes / 2;
45 char tmp[NLS_MAX_CHARSET_SIZE];
40 46
41 for (i = 0; (i < len) && from[i]; i++) { 47 for (i = 0; from[i] && i < maxwords; i++) {
42 int charlen; 48 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
43 /* 2.4.0 kernel or greater */ 49 NLS_MAX_CHARSET_SIZE);
44 charlen = 50 if (charlen > 0)
45 codepage->uni2char(le16_to_cpu(from[i]), &to[outlen],
46 NLS_MAX_CHARSET_SIZE);
47 if (charlen > 0) {
48 outlen += charlen; 51 outlen += charlen;
49 } else { 52 else
50 to[outlen++] = '?'; 53 outlen++;
54 }
55
56 return outlen;
57}
58
59/*
60 * cifs_mapchar - convert a little-endian char to proper char in codepage
61 * @target - where converted character should be copied
62 * @src_char - 2 byte little-endian source character
63 * @cp - codepage to which character should be converted
64 * @mapchar - should character be mapped according to mapchars mount option?
65 *
66 * This function handles the conversion of a single character. It is the
67 * responsibility of the caller to ensure that the target buffer is large
68 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
69 */
70static int
71cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
72 bool mapchar)
73{
74 int len = 1;
75
76 if (!mapchar)
77 goto cp_convert;
78
79 /*
80 * BB: Cannot handle remapping UNI_SLASH until all the calls to
81 * build_path_from_dentry are modified, as they use slash as
82 * separator.
83 */
84 switch (le16_to_cpu(src_char)) {
85 case UNI_COLON:
86 *target = ':';
87 break;
88 case UNI_ASTERIK:
89 *target = '*';
90 break;
91 case UNI_QUESTION:
92 *target = '?';
93 break;
94 case UNI_PIPE:
95 *target = '|';
96 break;
97 case UNI_GRTRTHAN:
98 *target = '>';
99 break;
100 case UNI_LESSTHAN:
101 *target = '<';
102 break;
103 default:
104 goto cp_convert;
105 }
106
107out:
108 return len;
109
110cp_convert:
111 len = cp->uni2char(le16_to_cpu(src_char), target,
112 NLS_MAX_CHARSET_SIZE);
113 if (len <= 0) {
114 *target = '?';
115 len = 1;
116 }
117 goto out;
118}
119
120/*
121 * cifs_from_ucs2 - convert utf16le string to local charset
122 * @to - destination buffer
123 * @from - source buffer
124 * @tolen - destination buffer size (in bytes)
125 * @fromlen - source buffer size (in bytes)
126 * @codepage - codepage to which characters should be converted
127 * @mapchar - should characters be remapped according to the mapchars option?
128 *
129 * Convert a little-endian ucs2le string (as sent by the server) to a string
130 * in the provided codepage. The tolen and fromlen parameters are to ensure
131 * that the code doesn't walk off of the end of the buffer (which is always
132 * a danger if the alignment of the source buffer is off). The destination
133 * string is always properly null terminated and fits in the destination
134 * buffer. Returns the length of the destination string in bytes (including
135 * null terminator).
136 *
137 * Note that some Windows versions actually send multiword UTF-16 characters
138 * instead of straight UCS-2. The Linux NLS routines, however, aren't able to
139 * deal with those characters properly. In the event that we get some of
140 * those characters, they won't be translated properly.
141 */
142int
143cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
144 const struct nls_table *codepage, bool mapchar)
145{
146 int i, charlen, safelen;
147 int outlen = 0;
148 int nullsize = nls_nullsize(codepage);
149 int fromwords = fromlen / 2;
150 char tmp[NLS_MAX_CHARSET_SIZE];
151
152 /*
153 * because the chars can be of varying widths, we need to take care
154 * not to overflow the destination buffer when we get close to the
155 * end of it. Until we get to this offset, we don't need to check
156 * for overflow however.
157 */
158 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
159
160 for (i = 0; i < fromwords && from[i]; i++) {
161 /*
162 * check to see if converting this character might make the
163 * conversion bleed into the null terminator
164 */
165 if (outlen >= safelen) {
166 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
167 if ((outlen + charlen) > (tolen - nullsize))
168 break;
51 } 169 }
170
171 /* put converted char into 'to' buffer */
172 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
173 outlen += charlen;
52 } 174 }
53 to[outlen] = 0; 175
176 /* properly null-terminate string */
177 for (i = 0; i < nullsize; i++)
178 to[outlen++] = 0;
179
54 return outlen; 180 return outlen;
55} 181}
56 182
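The safelen guard in cifs_from_ucs2() above is easiest to see with numbers. NLS_MAX_CHARSET_SIZE is 6 in <linux/nls.h>, so with a 64-byte destination and a 1-byte nul, safelen = 64 - (6 + 1) = 57: any character committed while outlen is below 57 cannot overrun even in the worst case, and from 57 onward each candidate is first converted into the scratch buffer to prove it fits under tolen - nullsize. A self-checking restatement of that arithmetic:

    #include <assert.h>

    int main(void)
    {
            int tolen = 64, nullsize = 1, max_char = 6;
            int safelen = tolen - (max_char + nullsize);    /* 57 */

            /* worst case below the threshold: outlen 56 + 6 = 62,
             * still <= tolen - nullsize = 63 */
            assert(safelen == 57);
            assert((safelen - 1) + max_char <= tolen - nullsize);
            return 0;
    }
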
@@ -88,3 +214,41 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
88 return i; 214 return i;
89} 215}
90 216
217/*
218 * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
219 * @src - source string
220 * @maxlen - don't walk past this many bytes in the source string
221 * @is_unicode - is this a unicode string?
222 * @codepage - destination codepage
223 *
224 * Take a string given by the server, convert it to the local codepage and
225 * put it in a new buffer. Returns a pointer to the new string or NULL on
226 * error.
227 */
228char *
229cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
230 const struct nls_table *codepage)
231{
232 int len;
233 char *dst;
234
235 if (is_unicode) {
236 len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
237 len += nls_nullsize(codepage);
238 dst = kmalloc(len, GFP_KERNEL);
239 if (!dst)
240 return NULL;
241 cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
242 false);
243 } else {
244 len = strnlen(src, maxlen);
245 len++;
246 dst = kmalloc(len, GFP_KERNEL);
247 if (!dst)
248 return NULL;
249 strlcpy(dst, src, len);
250 }
251
252 return dst;
253}
254
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 14eb9a2395d3..650638275a6f 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -5,7 +5,7 @@
5 * Convert a unicode character to upper or lower case using 5 * Convert a unicode character to upper or lower case using
6 * compressed tables. 6 * compressed tables.
7 * 7 *
8 * Copyright (c) International Business Machines Corp., 2000,2007 8 * Copyright (c) International Business Machines Corp., 2000,2009
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by 11 * it under the terms of the GNU General Public License as published by
@@ -37,6 +37,19 @@
37 37
38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ 38#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
39 39
40/*
41 * Windows maps these to the user-defined 16-bit Unicode range since they are
42 * reserved symbols (along with \ and /), otherwise illegal to store
43 * in filenames in NTFS
44 */
45#define UNI_ASTERIK (__u16) ('*' + 0xF000)
46#define UNI_QUESTION (__u16) ('?' + 0xF000)
47#define UNI_COLON (__u16) (':' + 0xF000)
48#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
49#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
50#define UNI_PIPE (__u16) ('|' + 0xF000)
51#define UNI_SLASH (__u16) ('\\' + 0xF000)
52
40/* Just define what we want from uniupr.h. We don't want to define the tables 53/* Just define what we want from uniupr.h. We don't want to define the tables
41 * in each source file. 54 * in each source file.
42 */ 55 */
@@ -59,8 +72,14 @@ extern struct UniCaseRange UniLowerRange[];
59#endif /* UNIUPR_NOLOWER */ 72#endif /* UNIUPR_NOLOWER */
60 73
61#ifdef __KERNEL__ 74#ifdef __KERNEL__
62int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *); 75int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
76 const struct nls_table *codepage, bool mapchar);
77int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
78 const struct nls_table *codepage);
63int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); 79int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
80char *cifs_strndup_from_ucs(const char *src, const int maxlen,
81 const bool is_unicode,
82 const struct nls_table *codepage);
64#endif 83#endif
65 84
66/* 85/*
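
The UNI_* constants above encode Windows' convention of parking the reserved filename characters in the private-use area at 0xF000 plus the ASCII code, which is what cifs_mapchar() undoes when the mapchars mount option is set. The mapping rule in isolation (a sketch; slash is excluded for the reason given in the BB comment):

    /* sketch: map a private-use-area code point back to ASCII */
    static char reserved_unmap(unsigned short uc)
    {
            switch (uc) {
            case '*' + 0xF000: return '*';
            case '?' + 0xF000: return '?';
            case ':' + 0xF000: return ':';
            case '>' + 0xF000: return '>';
            case '<' + 0xF000: return '<';
            case '|' + 0xF000: return '|';
            default:           return 0;   /* not a remapped char */
            }
    }
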
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38491fd3871d..5e6d35804d73 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,6 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h>
38#include "cifsfs.h" 39#include "cifsfs.h"
39#include "cifspdu.h" 40#include "cifspdu.h"
40#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -66,9 +67,6 @@ unsigned int sign_CIFS_PDUs = 1;
66extern struct task_struct *oplockThread; /* remove sparse warning */ 67extern struct task_struct *oplockThread; /* remove sparse warning */
67struct task_struct *oplockThread = NULL; 68struct task_struct *oplockThread = NULL;
68/* extern struct task_struct * dnotifyThread; remove sparse warning */ 69/* extern struct task_struct * dnotifyThread; remove sparse warning */
69#ifdef CONFIG_CIFS_EXPERIMENTAL
70static struct task_struct *dnotifyThread = NULL;
71#endif
72static const struct super_operations cifs_super_ops; 70static const struct super_operations cifs_super_ops;
73unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
74module_param(CIFSMaxBufSize, int, 0); 72module_param(CIFSMaxBufSize, int, 0);
@@ -316,6 +314,7 @@ cifs_alloc_inode(struct super_block *sb)
316 cifs_inode->clientCanCacheAll = false; 314 cifs_inode->clientCanCacheAll = false;
317 cifs_inode->delete_pending = false; 315 cifs_inode->delete_pending = false;
318 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
317 cifs_inode->server_eof = 0;
319 318
320 /* Can not set i_flags here - they get immediately overwritten 319 /* Can not set i_flags here - they get immediately overwritten
321 to zero by the VFS */ 320 to zero by the VFS */
@@ -532,6 +531,7 @@ static void cifs_umount_begin(struct super_block *sb)
532 if (tcon == NULL) 531 if (tcon == NULL)
533 return; 532 return;
534 533
534 lock_kernel();
535 read_lock(&cifs_tcp_ses_lock); 535 read_lock(&cifs_tcp_ses_lock);
536 if (tcon->tc_count == 1) 536 if (tcon->tc_count == 1)
537 tcon->tidStatus = CifsExiting; 537 tcon->tidStatus = CifsExiting;
@@ -550,6 +550,7 @@ static void cifs_umount_begin(struct super_block *sb)
550 } 550 }
551/* BB FIXME - finish add checks for tidStatus BB */ 551/* BB FIXME - finish add checks for tidStatus BB */
552 552
553 unlock_kernel();
553 return; 554 return;
554} 555}
555 556
@@ -601,8 +602,7 @@ cifs_get_sb(struct file_system_type *fs_type,
601 602
602 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); 603 rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
603 if (rc) { 604 if (rc) {
604 up_write(&sb->s_umount); 605 deactivate_locked_super(sb);
605 deactivate_super(sb);
606 return rc; 606 return rc;
607 } 607 }
608 sb->s_flags |= MS_ACTIVE; 608 sb->s_flags |= MS_ACTIVE;
@@ -1040,34 +1040,6 @@ static int cifs_oplock_thread(void *dummyarg)
1040 return 0; 1040 return 0;
1041} 1041}
1042 1042
1043#ifdef CONFIG_CIFS_EXPERIMENTAL
1044static int cifs_dnotify_thread(void *dummyarg)
1045{
1046 struct list_head *tmp;
1047 struct TCP_Server_Info *server;
1048
1049 do {
1050 if (try_to_freeze())
1051 continue;
1052 set_current_state(TASK_INTERRUPTIBLE);
1053 schedule_timeout(15*HZ);
1054 /* check if any stuck requests that need
1055 to be woken up and wakeq so the
1056 thread can wake up and error out */
1057 read_lock(&cifs_tcp_ses_lock);
1058 list_for_each(tmp, &cifs_tcp_ses_list) {
1059 server = list_entry(tmp, struct TCP_Server_Info,
1060 tcp_ses_list);
1061 if (atomic_read(&server->inFlight))
1062 wake_up_all(&server->response_q);
1063 }
1064 read_unlock(&cifs_tcp_ses_lock);
1065 } while (!kthread_should_stop());
1066
1067 return 0;
1068}
1069#endif
1070
1071static int __init 1043static int __init
1072init_cifs(void) 1044init_cifs(void)
1073{ 1045{
@@ -1144,21 +1116,8 @@ init_cifs(void)
1144 goto out_unregister_dfs_key_type; 1116 goto out_unregister_dfs_key_type;
1145 } 1117 }
1146 1118
1147#ifdef CONFIG_CIFS_EXPERIMENTAL
1148 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
1149 if (IS_ERR(dnotifyThread)) {
1150 rc = PTR_ERR(dnotifyThread);
1151 cERROR(1, ("error %d create dnotify thread", rc));
1152 goto out_stop_oplock_thread;
1153 }
1154#endif
1155
1156 return 0; 1119 return 0;
1157 1120
1158#ifdef CONFIG_CIFS_EXPERIMENTAL
1159 out_stop_oplock_thread:
1160#endif
1161 kthread_stop(oplockThread);
1162 out_unregister_dfs_key_type: 1121 out_unregister_dfs_key_type:
1163#ifdef CONFIG_CIFS_DFS_UPCALL 1122#ifdef CONFIG_CIFS_DFS_UPCALL
1164 unregister_key_type(&key_type_dns_resolver); 1123 unregister_key_type(&key_type_dns_resolver);
@@ -1196,9 +1155,6 @@ exit_cifs(void)
1196 cifs_destroy_inodecache(); 1155 cifs_destroy_inodecache();
1197 cifs_destroy_mids(); 1156 cifs_destroy_mids();
1198 cifs_destroy_request_bufs(); 1157 cifs_destroy_request_bufs();
1199#ifdef CONFIG_CIFS_EXPERIMENTAL
1200 kthread_stop(dnotifyThread);
1201#endif
1202 kthread_stop(oplockThread); 1158 kthread_stop(oplockThread);
1203} 1159}
1204 1160
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 77e190dc2883..051b71cfdea9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.57" 103#define CIFS_VERSION "1.58"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9fbf4dff5da6..a61ab772c6f6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -82,8 +82,8 @@ enum securityEnum {
82 LANMAN, /* Legacy LANMAN auth */ 82 LANMAN, /* Legacy LANMAN auth */
83 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 83 NTLM, /* Legacy NTLM012 auth with NTLM hash */
84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 84 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
85 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 85 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
86 NTLMSSP, /* NTLMSSP via SPNEGO */ 86 NTLMSSP, /* NTLMSSP via SPNEGO, NTLMv2 hash */
87 Kerberos, /* Kerberos via SPNEGO */ 87 Kerberos, /* Kerberos via SPNEGO */
88 MSKerberos, /* MS Kerberos via SPNEGO */ 88 MSKerberos, /* MS Kerberos via SPNEGO */
89}; 89};
@@ -350,7 +350,7 @@ struct cifsFileInfo {
350 bool invalidHandle:1; /* file closed via session abend */ 350 bool invalidHandle:1; /* file closed via session abend */
351 bool messageMode:1; /* for pipes: message vs byte mode */ 351 bool messageMode:1; /* for pipes: message vs byte mode */
352 atomic_t wrtPending; /* handle in use - defer close */ 352 atomic_t wrtPending; /* handle in use - defer close */
353 struct semaphore fh_sem; /* prevents reopen race after dead ses*/ 353 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
354 struct cifs_search_info srch_inf; 354 struct cifs_search_info srch_inf;
355}; 355};
356 356
@@ -370,6 +370,7 @@ struct cifsInodeInfo {
370 bool clientCanCacheAll:1; /* read and writebehind oplock */ 370 bool clientCanCacheAll:1; /* read and writebehind oplock */
371 bool oplockPending:1; 371 bool oplockPending:1;
372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 372 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
373 u64 server_eof; /* current file size on server */
373 struct inode vfs_inode; 374 struct inode vfs_inode;
374}; 375};
375 376
@@ -530,6 +531,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
530#define CIFSSEC_MAY_PLNTXT 0 531#define CIFSSEC_MAY_PLNTXT 0
531#endif /* weak passwords */ 532#endif /* weak passwords */
532#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ 533#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
534#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */
533 535
534#define CIFSSEC_MUST_SIGN 0x01001 536#define CIFSSEC_MUST_SIGN 0x01001
535/* note that only one of the following can be set so the 537/* note that only one of the following can be set so the
@@ -542,22 +544,23 @@ require use of the stronger protocol */
542#define CIFSSEC_MUST_LANMAN 0x10010 544#define CIFSSEC_MUST_LANMAN 0x10010
543#define CIFSSEC_MUST_PLNTXT 0x20020 545#define CIFSSEC_MUST_PLNTXT 0x20020
544#ifdef CONFIG_CIFS_UPCALL 546#ifdef CONFIG_CIFS_UPCALL
545#define CIFSSEC_MASK 0x3F03F /* allows weak security but also krb5 */ 547#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */
546#else 548#else
547#define CIFSSEC_MASK 0x37037 /* current flags supported if weak */ 549#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */
548#endif /* UPCALL */ 550#endif /* UPCALL */
549#else /* do not allow weak pw hash */ 551#else /* do not allow weak pw hash */
550#ifdef CONFIG_CIFS_UPCALL 552#ifdef CONFIG_CIFS_UPCALL
551#define CIFSSEC_MASK 0x0F00F /* flags supported if no weak allowed */ 553#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
552#else 554#else
553#define CIFSSEC_MASK 0x07007 /* flags supported if no weak allowed */ 555#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
554#endif /* UPCALL */ 556#endif /* UPCALL */
555#endif /* WEAK_PW_HASH */ 557#endif /* WEAK_PW_HASH */
556#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 558#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
559#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
557 560
558#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) 561#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
559#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 562#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
560#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) 563#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
561/* 564/*
562 ***************************************************************** 565 *****************************************************************
563 * All constants go here 566 * All constants go here
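
A pattern worth noting in the flag changes above: each mechanism's MAY flag occupies the low 12 bits and its MUST counterpart is simply MAY | (MAY << 12), so CIFSSEC_MUST_NTLMSSP (0x80080) follows mechanically from CIFSSEC_MAY_NTLMSSP (0x00080), just as 0x01001 follows from 0x00001 for signing. A quick check of that observed relation:

    #include <stdio.h>

    #define MAY_NTLMSSP     0x00080
    #define MUST_OF(may)    ((may) | ((may) << 12)) /* observed rule */

    int main(void)
    {
            /* prints 0x80080, matching CIFSSEC_MUST_NTLMSSP */
            printf("0x%05x\n", MUST_OF(MAY_NTLMSSP));
            return 0;
    }
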
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b370489c8da5..a785f69dbc9f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2163,7 +2163,7 @@ typedef struct {
2163 __le32 Type; 2163 __le32 Type;
2164 __le64 DevMajor; 2164 __le64 DevMajor;
2165 __le64 DevMinor; 2165 __le64 DevMinor;
2166 __u64 UniqueId; 2166 __le64 UniqueId;
2167 __le64 Permissions; 2167 __le64 Permissions;
2168 __le64 Nlinks; 2168 __le64 Nlinks;
2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ 2169} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */
@@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */
2308} __attribute__((packed)); 2308} __attribute__((packed));
2309 2309
2310struct file_internal_info { 2310struct file_internal_info {
2311 __u64 UniqueId; /* inode number */ 2311 __le64 UniqueId; /* inode number */
2312} __attribute__((packed)); /* level 0x3ee */ 2312} __attribute__((packed)); /* level 0x3ee */
2313 2313
2314struct file_mode_info { 2314struct file_mode_info {
@@ -2338,7 +2338,7 @@ typedef struct {
2338 __le32 Type; 2338 __le32 Type;
2339 __le64 DevMajor; 2339 __le64 DevMajor;
2340 __le64 DevMinor; 2340 __le64 DevMinor;
2341 __u64 UniqueId; 2341 __le64 UniqueId;
2342 __le64 Permissions; 2342 __le64 Permissions;
2343 __le64 Nlinks; 2343 __le64 Nlinks;
2344 char FileName[1]; 2344 char FileName[1];
@@ -2386,7 +2386,7 @@ typedef struct {
2386 __le32 FileNameLength; 2386 __le32 FileNameLength;
2387 __le32 EaSize; /* EA size */ 2387 __le32 EaSize; /* EA size */
2388 __le32 Reserved; 2388 __le32 Reserved;
2389 __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ 2389 __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
2390 char FileName[1]; 2390 char FileName[1];
2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ 2391} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
2392 2392
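The __u64 to __le64 change is not cosmetic: it tells sparse that UniqueId arrives in wire byte order, so every use must pass through le64_to_cpu(), as the GetInodeNumberRetry hunk further down now does. Without the conversion a big-endian host would see byte-swapped inode numbers. A one-line illustration:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /* sketch: the server sends UniqueId little-endian on the wire */
    static u64 unique_id_to_host(__le64 wire)
    {
            /* a wire value of 2 would read as 0x0200000000000000
             * on big-endian hardware without this conversion */
            return le64_to_cpu(wire);
    }
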
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 4167716d32f2..fae083930eee 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -260,8 +260,7 @@ extern int CIFSUnixCreateSymLink(const int xid,
260 const struct nls_table *nls_codepage); 260 const struct nls_table *nls_codepage);
261extern int CIFSSMBUnixQuerySymLink(const int xid, 261extern int CIFSSMBUnixQuerySymLink(const int xid,
262 struct cifsTconInfo *tcon, 262 struct cifsTconInfo *tcon,
263 const unsigned char *searchName, 263 const unsigned char *searchName, char **syminfo,
264 char *syminfo, const int buflen,
265 const struct nls_table *nls_codepage); 264 const struct nls_table *nls_codepage);
266extern int CIFSSMBQueryReparseLinkInfo(const int xid, 265extern int CIFSSMBQueryReparseLinkInfo(const int xid,
267 struct cifsTconInfo *tcon, 266 struct cifsTconInfo *tcon,
@@ -307,8 +306,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
307 const unsigned char *searchName, __u64 *inode_number, 306 const unsigned char *searchName, __u64 *inode_number,
308 const struct nls_table *nls_codepage, 307 const struct nls_table *nls_codepage,
309 int remap_special_chars); 308 int remap_special_chars);
310extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
311 const struct nls_table *codepage);
312extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 309extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
313 const struct nls_table *cp, int mapChars); 310 const struct nls_table *cp, int mapChars);
314 311
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bc09c998631f..d06260251c30 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -81,41 +81,6 @@ static struct {
81#endif /* CONFIG_CIFS_WEAK_PW_HASH */ 81#endif /* CONFIG_CIFS_WEAK_PW_HASH */
82#endif /* CIFS_POSIX */ 82#endif /* CIFS_POSIX */
83 83
84/* Allocates buffer into dst and copies smb string from src to it.
85 * caller is responsible for freeing dst if function returned 0.
86 * returns:
87 * on success - 0
88 * on failure - errno
89 */
90static int
91cifs_strncpy_to_host(char **dst, const char *src, const int maxlen,
92 const bool is_unicode, const struct nls_table *nls_codepage)
93{
94 int plen;
95
96 if (is_unicode) {
97 plen = UniStrnlen((wchar_t *)src, maxlen);
98 *dst = kmalloc(plen + 2, GFP_KERNEL);
99 if (!*dst)
100 goto cifs_strncpy_to_host_ErrExit;
101 cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage);
102 } else {
103 plen = strnlen(src, maxlen);
104 *dst = kmalloc(plen + 2, GFP_KERNEL);
105 if (!*dst)
106 goto cifs_strncpy_to_host_ErrExit;
107 strncpy(*dst, src, plen);
108 }
109 (*dst)[plen] = 0;
110 (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */
111 return 0;
112
113cifs_strncpy_to_host_ErrExit:
114 cERROR(1, ("Failed to allocate buffer for string\n"));
115 return -ENOMEM;
116}
117
118
119/* Mark all open files on tree connections as invalid, since they 84
120 were closed when the session to the server was lost */ 85
121static void mark_open_files_invalid(struct cifsTconInfo *pTcon) 86static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
@@ -484,6 +449,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
484 cFYI(1, ("Kerberos only mechanism, enable extended security")); 449 cFYI(1, ("Kerberos only mechanism, enable extended security"));
485 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 450 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
486 } 451 }
452#ifdef CONFIG_CIFS_EXPERIMENTAL
453 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
454 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
455 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
456 cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
457 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
458 }
459#endif
487 460
488 count = 0; 461 count = 0;
489 for (i = 0; i < CIFS_NUM_PROT; i++) { 462 for (i = 0; i < CIFS_NUM_PROT; i++) {
@@ -620,6 +593,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
620 server->secType = NTLMv2; 593 server->secType = NTLMv2;
621 else if (secFlags & CIFSSEC_MAY_KRB5) 594 else if (secFlags & CIFSSEC_MAY_KRB5)
622 server->secType = Kerberos; 595 server->secType = Kerberos;
596 else if (secFlags & CIFSSEC_MAY_NTLMSSP)
597 server->secType = NTLMSSP;
623 else if (secFlags & CIFSSEC_MAY_LANMAN) 598 else if (secFlags & CIFSSEC_MAY_LANMAN)
624 server->secType = LANMAN; 599 server->secType = LANMAN;
625/* #ifdef CONFIG_CIFS_EXPERIMENTAL 600/* #ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -1626,6 +1601,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1626 int smb_hdr_len; 1601 int smb_hdr_len;
1627 int resp_buf_type = 0; 1602 int resp_buf_type = 0;
1628 1603
1604 *nbytes = 0;
1605
1629 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1606 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
1630 1607
1631 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1608 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -1682,11 +1659,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1682 cifs_stats_inc(&tcon->num_writes); 1659 cifs_stats_inc(&tcon->num_writes);
1683 if (rc) { 1660 if (rc) {
1684 cFYI(1, ("Send error Write2 = %d", rc)); 1661 cFYI(1, ("Send error Write2 = %d", rc));
1685 *nbytes = 0;
1686 } else if (resp_buf_type == 0) { 1662 } else if (resp_buf_type == 0) {
1687 /* presumably this can not happen, but best to be safe */ 1663 /* presumably this can not happen, but best to be safe */
1688 rc = -EIO; 1664 rc = -EIO;
1689 *nbytes = 0;
1690 } else { 1665 } else {
1691 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base; 1666 WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
1692 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1667 *nbytes = le16_to_cpu(pSMBr->CountHigh);
@@ -2417,8 +2392,7 @@ winCreateHardLinkRetry:
2417 2392
2418int 2393int
2419CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, 2394CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2420 const unsigned char *searchName, 2395 const unsigned char *searchName, char **symlinkinfo,
2421 char *symlinkinfo, const int buflen,
2422 const struct nls_table *nls_codepage) 2396 const struct nls_table *nls_codepage)
2423{ 2397{
2424/* SMB_QUERY_FILE_UNIX_LINK */ 2398/* SMB_QUERY_FILE_UNIX_LINK */
@@ -2428,6 +2402,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2428 int bytes_returned; 2402 int bytes_returned;
2429 int name_len; 2403 int name_len;
2430 __u16 params, byte_count; 2404 __u16 params, byte_count;
2405 char *data_start;
2431 2406
2432 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2407 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
2433 2408
@@ -2482,30 +2457,26 @@ querySymLinkRetry:
2482 /* decode response */ 2457 /* decode response */
2483 2458
2484 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 2459 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
2485 if (rc || (pSMBr->ByteCount < 2))
2486 /* BB also check enough total bytes returned */ 2460 /* BB also check enough total bytes returned */
2487 rc = -EIO; /* bad smb */ 2461 if (rc || (pSMBr->ByteCount < 2))
2462 rc = -EIO;
2488 else { 2463 else {
2489 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 2464 bool is_unicode;
2490 __u16 count = le16_to_cpu(pSMBr->t2.DataCount); 2465 u16 count = le16_to_cpu(pSMBr->t2.DataCount);
2466
2467 data_start = ((char *) &pSMBr->hdr.Protocol) +
2468 le16_to_cpu(pSMBr->t2.DataOffset);
2469
2470 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
2471 is_unicode = true;
2472 else
2473 is_unicode = false;
2491 2474
2492 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2493 name_len = UniStrnlen((wchar_t *) ((char *)
2494 &pSMBr->hdr.Protocol + data_offset),
2495 min_t(const int, buflen, count) / 2);
2496 /* BB FIXME investigate remapping reserved chars here */ 2475 /* BB FIXME investigate remapping reserved chars here */
2497 cifs_strfromUCS_le(symlinkinfo, 2476 *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
2498 (__le16 *) ((char *)&pSMBr->hdr.Protocol 2477 is_unicode, nls_codepage);
2499 + data_offset), 2478 if (!*symlinkinfo)
2500 name_len, nls_codepage); 2479 rc = -ENOMEM;
2501 } else {
2502 strncpy(symlinkinfo,
2503 (char *) &pSMBr->hdr.Protocol +
2504 data_offset,
2505 min_t(const int, buflen, count));
2506 }
2507 symlinkinfo[buflen] = 0;
2508 /* just in case so calling code does not go off the end of buffer */
2509 } 2480 }
2510 } 2481 }
2511 cifs_buf_release(pSMB); 2482 cifs_buf_release(pSMB);
@@ -2603,7 +2574,6 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2603 *pparmlen = parm_count; 2574 *pparmlen = parm_count;
2604 return 0; 2575 return 0;
2605} 2576}
2606#endif /* CIFS_EXPERIMENTAL */
2607 2577
2608int 2578int
2609CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2579CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
@@ -2613,7 +2583,6 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2613{ 2583{
2614 int rc = 0; 2584 int rc = 0;
2615 int bytes_returned; 2585 int bytes_returned;
2616 int name_len;
2617 struct smb_com_transaction_ioctl_req *pSMB; 2586 struct smb_com_transaction_ioctl_req *pSMB;
2618 struct smb_com_transaction_ioctl_rsp *pSMBr; 2587 struct smb_com_transaction_ioctl_rsp *pSMBr;
2619 2588
@@ -2650,59 +2619,55 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2650 } else { /* decode response */ 2619 } else { /* decode response */
2651 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2620 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2652 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2621 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
2653 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) 2622 if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
2654 /* BB also check enough total bytes returned */ 2623 /* BB also check enough total bytes returned */
2655 rc = -EIO; /* bad smb */ 2624 rc = -EIO; /* bad smb */
2656 else { 2625 goto qreparse_out;
2657 if (data_count && (data_count < 2048)) { 2626 }
2658 char *end_of_smb = 2 /* sizeof byte count */ + 2627 if (data_count && (data_count < 2048)) {
2659 pSMBr->ByteCount + 2628 char *end_of_smb = 2 /* sizeof byte count */ +
2660 (char *)&pSMBr->ByteCount; 2629 pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
2661 2630
2662 struct reparse_data *reparse_buf = 2631 struct reparse_data *reparse_buf =
2663 (struct reparse_data *) 2632 (struct reparse_data *)
2664 ((char *)&pSMBr->hdr.Protocol 2633 ((char *)&pSMBr->hdr.Protocol
2665 + data_offset); 2634 + data_offset);
2666 if ((char *)reparse_buf >= end_of_smb) { 2635 if ((char *)reparse_buf >= end_of_smb) {
2667 rc = -EIO; 2636 rc = -EIO;
2668 goto qreparse_out; 2637 goto qreparse_out;
2669 } 2638 }
2670 if ((reparse_buf->LinkNamesBuf + 2639 if ((reparse_buf->LinkNamesBuf +
2671 reparse_buf->TargetNameOffset + 2640 reparse_buf->TargetNameOffset +
2672 reparse_buf->TargetNameLen) > 2641 reparse_buf->TargetNameLen) > end_of_smb) {
2673 end_of_smb) { 2642 cFYI(1, ("reparse buf beyond SMB"));
2674 cFYI(1, ("reparse buf beyond SMB")); 2643 rc = -EIO;
2675 rc = -EIO; 2644 goto qreparse_out;
2676 goto qreparse_out; 2645 }
2677 }
2678 2646
2679 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { 2647 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
2680 name_len = UniStrnlen((wchar_t *) 2648 cifs_from_ucs2(symlinkinfo, (__le16 *)
2681 (reparse_buf->LinkNamesBuf + 2649 (reparse_buf->LinkNamesBuf +
2682 reparse_buf->TargetNameOffset), 2650 reparse_buf->TargetNameOffset),
2683 min(buflen/2, 2651 buflen,
2684 reparse_buf->TargetNameLen / 2)); 2652 reparse_buf->TargetNameLen,
2685 cifs_strfromUCS_le(symlinkinfo, 2653 nls_codepage, 0);
2686 (__le16 *) (reparse_buf->LinkNamesBuf + 2654 } else { /* ASCII names */
2687 reparse_buf->TargetNameOffset), 2655 strncpy(symlinkinfo,
2688 name_len, nls_codepage); 2656 reparse_buf->LinkNamesBuf +
2689 } else { /* ASCII names */ 2657 reparse_buf->TargetNameOffset,
2690 strncpy(symlinkinfo, 2658 min_t(const int, buflen,
2691 reparse_buf->LinkNamesBuf + 2659 reparse_buf->TargetNameLen));
2692 reparse_buf->TargetNameOffset,
2693 min_t(const int, buflen,
2694 reparse_buf->TargetNameLen));
2695 }
2696 } else {
2697 rc = -EIO;
2698 cFYI(1, ("Invalid return data count on "
2699 "get reparse info ioctl"));
2700 } 2660 }
2701 symlinkinfo[buflen] = 0; /* just in case so the caller 2661 } else {
2702 does not go off the end of the buffer */ 2662 rc = -EIO;
2703 cFYI(1, ("readlink result - %s", symlinkinfo)); 2663 cFYI(1, ("Invalid return data count on "
2664 "get reparse info ioctl"));
2704 } 2665 }
2666 symlinkinfo[buflen] = 0; /* just in case so the caller
2667 does not go off the end of the buffer */
2668 cFYI(1, ("readlink result - %s", symlinkinfo));
2705 } 2669 }
2670
2706qreparse_out: 2671qreparse_out:
2707 cifs_buf_release(pSMB); 2672 cifs_buf_release(pSMB);
2708 2673
@@ -2711,6 +2676,7 @@ qreparse_out:
2711 2676
2712 return rc; 2677 return rc;
2713} 2678}
2679#endif /* CIFS_EXPERIMENTAL */
2714 2680
2715#ifdef CONFIG_CIFS_POSIX 2681#ifdef CONFIG_CIFS_POSIX
2716 2682
@@ -3918,7 +3884,7 @@ GetInodeNumberRetry:
3918 } 3884 }
3919 pfinfo = (struct file_internal_info *) 3885 pfinfo = (struct file_internal_info *)
3920 (data_offset + (char *) &pSMBr->hdr.Protocol); 3886 (data_offset + (char *) &pSMBr->hdr.Protocol);
3921 *inode_number = pfinfo->UniqueId; 3887 *inode_number = le64_to_cpu(pfinfo->UniqueId);
3922 } 3888 }
3923 } 3889 }
3924GetInodeNumOut: 3890GetInodeNumOut:
@@ -3928,27 +3894,6 @@ GetInodeNumOut:
3928 return rc; 3894 return rc;
3929} 3895}
3930 3896
3931/* computes length of UCS string converted to host codepage
3932 * @src: UCS string
3933 * @maxlen: length of the input string in UCS characters
3934 * (not in bytes)
3935 *
3936 * return: size of input string in host codepage
3937 */
3938static int hostlen_fromUCS(const __le16 *src, const int maxlen,
3939 const struct nls_table *nls_codepage) {
3940 int i;
3941 int hostlen = 0;
3942 char to[4];
3943 int charlen;
3944 for (i = 0; (i < maxlen) && src[i]; ++i) {
3945 charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
3946 to, NLS_MAX_CHARSET_SIZE);
3947 hostlen += charlen > 0 ? charlen : 1;
3948 }
3949 return hostlen;
3950}
3951
3952/* parses DFS referral V3 structure 3897
3953 * caller is responsible for freeing target_nodes 3898 * caller is responsible for freeing target_nodes
3954 * returns: 3899 * returns:
@@ -3994,7 +3939,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3994 3939
3995 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 3940 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
3996 *num_of_nodes, 3941 *num_of_nodes,
3997 le16_to_cpu(pSMBr->DFSFlags))); 3942 le32_to_cpu(pSMBr->DFSFlags)));
3998 3943
3999 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 3944 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4000 *num_of_nodes, GFP_KERNEL); 3945 *num_of_nodes, GFP_KERNEL);
@@ -4010,14 +3955,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4010 int max_len; 3955 int max_len;
4011 struct dfs_info3_param *node = (*target_nodes)+i; 3956 struct dfs_info3_param *node = (*target_nodes)+i;
4012 3957
4013 node->flags = le16_to_cpu(pSMBr->DFSFlags); 3958 node->flags = le32_to_cpu(pSMBr->DFSFlags);
4014 if (is_unicode) { 3959 if (is_unicode) {
4015 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3960 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
4016 GFP_KERNEL); 3961 GFP_KERNEL);
4017 cifsConvertToUCS((__le16 *) tmp, searchName, 3962 cifsConvertToUCS((__le16 *) tmp, searchName,
4018 PATH_MAX, nls_codepage, remap); 3963 PATH_MAX, nls_codepage, remap);
4019 node->path_consumed = hostlen_fromUCS(tmp, 3964 node->path_consumed = cifs_ucs2_bytes(tmp,
4020 le16_to_cpu(pSMBr->PathConsumed)/2, 3965 le16_to_cpu(pSMBr->PathConsumed),
4021 nls_codepage); 3966 nls_codepage);
4022 kfree(tmp); 3967 kfree(tmp);
4023 } else 3968 } else
@@ -4029,20 +3974,20 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4029 /* copy DfsPath */ 3974 /* copy DfsPath */
4030 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); 3975 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
4031 max_len = data_end - temp; 3976 max_len = data_end - temp;
4032 rc = cifs_strncpy_to_host(&(node->path_name), temp, 3977 node->path_name = cifs_strndup_from_ucs(temp, max_len,
4033 max_len, is_unicode, nls_codepage); 3978 is_unicode, nls_codepage);
4034 if (rc) 3979 if (!node->path_name) {
3980 rc = -ENOMEM;
4035 goto parse_DFS_referrals_exit; 3981 goto parse_DFS_referrals_exit;
3982 }
4036 3983
4037 /* copy link target UNC */ 3984 /* copy link target UNC */
4038 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); 3985 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
4039 max_len = data_end - temp; 3986 max_len = data_end - temp;
4040 rc = cifs_strncpy_to_host(&(node->node_name), temp, 3987 node->node_name = cifs_strndup_from_ucs(temp, max_len,
4041 max_len, is_unicode, nls_codepage); 3988 is_unicode, nls_codepage);
4042 if (rc) 3989 if (!node->node_name)
4043 goto parse_DFS_referrals_exit; 3990 rc = -ENOMEM;
4044
4045 ref += le16_to_cpu(ref->Size);
4046 } 3991 }
4047 3992
4048parse_DFS_referrals_exit: 3993parse_DFS_referrals_exit:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0de3b5615a22..4aa81a507b74 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/connect.c 2 * fs/cifs/connect.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2009
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/pagevec.h>
 #include <linux/freezer.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <net/ipv6.h>
@@ -978,6 +979,13 @@ cifs_parse_mount_options(char *options, const char *devname,
 			return 1;
 		} else if (strnicmp(value, "krb5", 4) == 0) {
 			vol->secFlg |= CIFSSEC_MAY_KRB5;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
+			vol->secFlg |= CIFSSEC_MAY_NTLMSSP |
+				CIFSSEC_MUST_SIGN;
+		} else if (strnicmp(value, "ntlmssp", 7) == 0) {
+			vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+#endif
 		} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
 			vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
 				CIFSSEC_MUST_SIGN;
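Editor's note: the ordering of the two new arms matters because the option parser matches by prefix; "ntlmsspi" must be tested before "ntlmssp", or the shorter prefix would swallow the signing variant. A minimal userspace sketch of the same pattern (strncasecmp standing in for the kernel's strnicmp; return codes are illustrative):

	#include <strings.h>

	/* longer token first, otherwise "ntlmsspi" matches the
	 * "ntlmssp" arm and the signing flag is never set */
	static int parse_sec_token(const char *value)
	{
		if (strncasecmp(value, "ntlmsspi", 8) == 0)
			return 2;	/* NTLMSSP + mandatory signing */
		if (strncasecmp(value, "ntlmssp", 7) == 0)
			return 1;	/* NTLMSSP */
		return 0;
	}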
@@ -2214,9 +2222,58 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
 	return rc;
 }
 
+static void
+cleanup_volume_info(struct smb_vol **pvolume_info)
+{
+	struct smb_vol *volume_info;
+
+	if (!pvolume_info && !*pvolume_info)
+		return;
+
+	volume_info = *pvolume_info;
+	kzfree(volume_info->password);
+	kfree(volume_info->UNC);
+	kfree(volume_info->prepath);
+	kfree(volume_info);
+	*pvolume_info = NULL;
+	return;
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+/* build_path_to_root returns full path to root when
+ * we do not have an exiting connection (tcon) */
+static char *
+build_unc_path_to_root(const struct smb_vol *volume_info,
+		const struct cifs_sb_info *cifs_sb)
+{
+	char *full_path;
+
+	int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
+	full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
+	if (full_path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	strncpy(full_path, volume_info->UNC, unc_len);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+		int i;
+		for (i = 0; i < unc_len; i++) {
+			if (full_path[i] == '\\')
+				full_path[i] = '/';
+		}
+	}
+
+	if (cifs_sb->prepathlen)
+		strncpy(full_path + unc_len, cifs_sb->prepath,
+			cifs_sb->prepathlen);
+
+	full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
+	return full_path;
+}
+#endif
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
-	   char *mount_data, const char *devname)
+	   char *mount_data_global, const char *devname)
 {
 	int rc = 0;
 	int xid;
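Editor's note on cleanup_volume_info() as added above: the early-return guard uses &&, so with a NULL pvolume_info the second operand dereferences a NULL pointer, and with a NULL *pvolume_info the function falls through to volume_info->password on a NULL object; the intent is clearly ||, which short-circuits safely in both cases. A hedged sketch of the defensive form (not the code as merged):

	static void cleanup_volume_info(struct smb_vol **pvolume_info)
	{
		struct smb_vol *volume_info;

		/* '||' short-circuits: a NULL handle never reaches the
		 * deref, a NULL object never reaches the frees below */
		if (pvolume_info == NULL || *pvolume_info == NULL)
			return;

		volume_info = *pvolume_info;
		kzfree(volume_info->password);	/* zeroes before freeing */
		kfree(volume_info->UNC);
		kfree(volume_info->prepath);
		kfree(volume_info);
		*pvolume_info = NULL;
	}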
@@ -2225,6 +2282,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
 	char *full_path;
+	char *mount_data = mount_data_global;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct dfs_info3_param *referrals = NULL;
+	unsigned int num_referrals = 0;
+	int referral_walks_count = 0;
+try_mount_again:
+#endif
+	full_path = NULL;
 
 	xid = GetXid();
 
@@ -2371,11 +2436,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	/* check for null share name ie connect to dfs root */
 	if ((strchr(volume_info->UNC + 3, '\\') == NULL)
 	    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-		/* rc = connect_to_dfs_path(...) */
-		cFYI(1, ("DFS root not supported"));
+		cERROR(1, ("Missing share name"));
 		rc = -ENODEV;
 		goto mount_fail_check;
 	} else {
@@ -2392,7 +2455,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 	if (rc)
-		goto mount_fail_check;
+		goto remote_path_check;
 	tcon->seal = volume_info->seal;
 	write_lock(&cifs_tcp_ses_lock);
 	list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
@@ -2417,19 +2480,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-mount_fail_check:
-	/* on error free sesinfo and tcon struct if needed */
-	if (rc) {
-		/* If find_unc succeeded then rc == 0 so we can not end */
-		/* up accidently freeing someone elses tcon struct */
-		if (tcon)
-			cifs_put_tcon(tcon);
-		else if (pSesInfo)
-			cifs_put_smb_ses(pSesInfo);
-		else
-			cifs_put_tcp_session(srvTcp);
-		goto out;
-	}
+	if (rc)
+		goto remote_path_check;
+
 	cifs_sb->tcon = tcon;
 
 	/* do not care if following two calls succeed - informational */
@@ -2461,7 +2514,9 @@ mount_fail_check:
 	cifs_sb->rsize = min(cifs_sb->rsize,
 			(tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
-	if (!rc && cifs_sb->prepathlen) {
+remote_path_check:
+	/* check if a whole path (including prepath) is not remote */
+	if (!rc && cifs_sb->prepathlen && tcon) {
 		/* build_path_to_root works only when we have a valid tcon */
 		full_path = cifs_build_path_to_root(cifs_sb);
 		if (full_path == NULL) {
@@ -2469,1079 +2524,91 @@ mount_fail_check:
 			goto mount_fail_check;
 		}
 		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-		if (rc) {
-			cERROR(1, ("Path %s in not accessible: %d",
-				full_path, rc));
+		if (rc != -EREMOTE) {
 			kfree(full_path);
 			goto mount_fail_check;
 		}
 		kfree(full_path);
 	}
 
-	/* volume_info->password is freed above when existing session found
-	(in which case it is not needed anymore) but when new sesion is created
-	the password ptr is put in the new session structure (in which case the
-	password will be freed at unmount time) */
-out:
-	/* zero out password before freeing */
-	if (volume_info) {
-		if (volume_info->password != NULL) {
-			memset(volume_info->password, 0,
-				strlen(volume_info->password));
-			kfree(volume_info->password);
-		}
-		kfree(volume_info->UNC);
-		kfree(volume_info->prepath);
-		kfree(volume_info);
-	}
-	FreeXid(xid);
-	return rc;
-}
-
-static int
-CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-	      char session_key[CIFS_SESS_KEY_SIZE],
-	      const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *user;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	__u32 capabilities;
-	__u16 count;
-
-	cFYI(1, ("In sesssetup"));
-	if (ses == NULL)
-		return -EINVAL;
-	user = ses->userName;
-	domain = ses->domainName;
-	smb_buffer = cifs_buf_get();
-
-	if (smb_buffer == NULL)
-		return -ENOMEM;
-
-	smb_buffer_response = smb_buffer;
-	pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 13 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req_no_secext.AndXCommand = 0xFF;
-	pSMB->req_no_secext.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req_no_secext.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	if (ses->server->secMode &
-			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-		CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-
-	pSMB->req_no_secext.CaseInsensitivePasswordLength =
-		cpu_to_le16(CIFS_SESS_KEY_SIZE);
-
-	pSMB->req_no_secext.CaseSensitivePasswordLength =
-		cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	bcc_ptr = pByteArea(smb_buffer);
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
-
-	if (ses->capabilities & CAP_UNICODE) {
-		if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		if (user == NULL)
-			bytes_returned = 0; /* skip null user */
-		else
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
-					nls_codepage);
-		/* convert number of 16 bit words to bytes */
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* trailing null */
-		if (domain == NULL)
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr,
-					"CIFS_LINUX_DOM", 32, nls_codepage);
-		else
-			bytes_returned =
-				cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-					nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release,
-				32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-			cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-	} else {
-		if (user != NULL) {
-			strncpy(bcc_ptr, user, 200);
-			bcc_ptr += strnlen(user, 200);
-		}
-		*bcc_ptr = 0;
-		bcc_ptr++;
-		if (domain == NULL) {
-			strcpy(bcc_ptr, "CIFS_LINUX_DOM");
-			bcc_ptr += strlen("CIFS_LINUX_DOM") + 1;
-		} else {
-			strncpy(bcc_ptr, domain, 64);
-			bcc_ptr += strnlen(domain, 64);
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-	}
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req_no_secext.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
-	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */
-	} else if ((smb_buffer_response->WordCount == 3)
-		   || (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
-		ses->Suid = smb_buffer_response->Uid; /* UID left in wire format
-							 (little endian) */
-		cFYI(1, ("UID = %d ", ses->Suid));
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-		bcc_ptr = pByteArea(smb_buffer_response);
-		if ((pSMBr->resp.hdr.WordCount == 3)
-		    || ((pSMBr->resp.hdr.WordCount == 4)
-			&& (blob_len < pSMBr->resp.ByteCount))) {
-			if (pSMBr->resp.hdr.WordCount == 4)
-				bcc_ptr += blob_len;
-
-			if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-				if ((long) (bcc_ptr) % 2) {
-					remaining_words =
-					    (BCC(smb_buffer_response) - 1) / 2;
-					/* Unicode strings must be word
-					   aligned */
-					bcc_ptr++;
-				} else {
-					remaining_words =
-					    BCC(smb_buffer_response) / 2;
-				}
-				len = UniStrnlen((wchar_t *) bcc_ptr,
-						 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-				if (ses->serverOS)
-					kfree(ses->serverOS);
-				ses->serverOS = kzalloc(2 * (len + 1),
-							GFP_KERNEL);
-				if (ses->serverOS == NULL)
-					goto sesssetup_nomem;
-				cifs_strfromUCS_le(ses->serverOS,
-						   (__le16 *)bcc_ptr,
-						   len, nls_codepage);
-				bcc_ptr += 2 * (len + 1);
-				remaining_words -= len + 1;
-				ses->serverOS[2 * len] = 0;
-				ses->serverOS[1 + (2 * len)] = 0;
-				if (remaining_words > 0) {
-					len = UniStrnlen((wchar_t *)bcc_ptr,
-							 remaining_words-1);
-					kfree(ses->serverNOS);
-					ses->serverNOS = kzalloc(2 * (len + 1),
-								 GFP_KERNEL);
-					if (ses->serverNOS == NULL)
-						goto sesssetup_nomem;
-					cifs_strfromUCS_le(ses->serverNOS,
-							   (__le16 *)bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					ses->serverNOS[2 * len] = 0;
-					ses->serverNOS[1 + (2 * len)] = 0;
-					if (strncmp(ses->serverNOS,
-						"NT LAN Manager 4", 16) == 0) {
-						cFYI(1, ("NT4 server"));
-						ses->flags |= CIFS_SES_NT4;
-					}
-					remaining_words -= len + 1;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-						/* last string is not always null terminated
-						   (for e.g. for Windows XP & 2000) */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(2*(len+1),
-							    GFP_KERNEL);
-						if (ses->serverDomain == NULL)
-							goto sesssetup_nomem;
-						cifs_strfromUCS_le(ses->serverDomain,
-							(__le16 *)bcc_ptr,
-							len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverDomain[2*len] = 0;
-						ses->serverDomain[1+(2*len)] = 0;
-					} else { /* else no more room so create
-						    dummy domain string */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-							kzalloc(2, GFP_KERNEL);
-					}
-				} else { /* no room so create dummy domain
-					    and NOS string */
-
-					/* if these kcallocs fail not much we
-					   can do, but better to not fail the
-					   sesssetup itself */
-					kfree(ses->serverDomain);
-					ses->serverDomain =
-						kzalloc(2, GFP_KERNEL);
-					kfree(ses->serverNOS);
-					ses->serverNOS =
-						kzalloc(2, GFP_KERNEL);
-				}
-			} else {	/* ASCII */
-				len = strnlen(bcc_ptr, 1024);
-				if (((long) bcc_ptr + len) - (long)
-				    pByteArea(smb_buffer_response)
-				    <= BCC(smb_buffer_response)) {
-					kfree(ses->serverOS);
-					ses->serverOS = kzalloc(len + 1,
-								GFP_KERNEL);
-					if (ses->serverOS == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverOS, bcc_ptr, len);
-
-					bcc_ptr += len;
-					/* null terminate the string */
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-
-					len = strnlen(bcc_ptr, 1024);
-					kfree(ses->serverNOS);
-					ses->serverNOS = kzalloc(len + 1,
-								 GFP_KERNEL);
-					if (ses->serverNOS == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverNOS, bcc_ptr, len);
-					bcc_ptr += len;
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-
-					len = strnlen(bcc_ptr, 1024);
-					if (ses->serverDomain)
-						kfree(ses->serverDomain);
-					ses->serverDomain = kzalloc(len + 1,
-								    GFP_KERNEL);
-					if (ses->serverDomain == NULL)
-						goto sesssetup_nomem;
-					strncpy(ses->serverDomain, bcc_ptr,
-						len);
-					bcc_ptr += len;
-					bcc_ptr[0] = 0;
-					bcc_ptr++;
-				} else
-					cFYI(1,
-					     ("Variable field of length %d "
-					      "extends beyond end of smb ",
-					      len));
-			}
-		} else {
-			cERROR(1, ("Security Blob Length extends beyond "
-				   "end of SMB"));
+	/* get referral if needed */
+	if (rc == -EREMOTE) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (referral_walks_count > MAX_NESTED_LINKS) {
+			/*
+			 * BB: when we implement proper loop detection,
+			 *     we will remove this check. But now we need it
+			 *     to prevent an indefinite loop if 'DFS tree' is
+			 *     misconfigured (i.e. has loops).
+			 */
+			rc = -ELOOP;
+			goto mount_fail_check;
 		}
-	} else {
-		cERROR(1, ("Invalid Word count %d: ",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
-	}
-sesssetup_nomem:	/* do not return an error on nomem for the info
-			   strings, since that could make reconnection harder,
-			   and reconnection might be needed to free memory */
-	cifs_buf_release(smb_buffer);
-
-	return rc;
-}
-
-static int
-CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
-			      struct cifsSesInfo *ses, bool *pNTLMv2_flag,
-			      const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	int SecurityBlobLength = sizeof(NEGOTIATE_MESSAGE);
-	PNEGOTIATE_MESSAGE SecurityBlob;
-	PCHALLENGE_MESSAGE SecurityBlob2;
-	__u32 negotiate_flags, capabilities;
-	__u16 count;
-
-	cFYI(1, ("In NTLMSSP sesssetup (negotiate)"));
-	if (ses == NULL)
-		return -EINVAL;
-	domain = ses->domainName;
-	*pNTLMv2_flag = false;
-	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
-		return -ENOMEM;
-	}
-	smb_buffer_response = smb_buffer;
-	pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-	pSMBr = (SESSION_SETUP_ANDX *) smb_buffer_response;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 12 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
-
-	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-	    CAP_EXTENDED_SECURITY;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req.Capabilities = cpu_to_le32(capabilities);
-
-	bcc_ptr = (char *) &pSMB->req.SecurityBlob;
-	SecurityBlob = (PNEGOTIATE_MESSAGE) bcc_ptr;
-	strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
-	SecurityBlob->MessageType = NtLmNegotiate;
-	negotiate_flags =
-	    NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_OEM |
-	    NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM |
-	    NTLMSSP_NEGOTIATE_56 |
-	    /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128;
-	if (sign_CIFS_PDUs)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN;
-/*	if (ntlmv2_support)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/
-	/* setup pointers to domain name and workstation name */
-	bcc_ptr += SecurityBlobLength;
-
-	SecurityBlob->WorkstationName.Buffer = 0;
-	SecurityBlob->WorkstationName.Length = 0;
-	SecurityBlob->WorkstationName.MaximumLength = 0;
-
-	/* Domain not sent on first Sesssetup in NTLMSSP, instead it is sent
-	   along with username on auth request (ie the response to challenge) */
-	SecurityBlob->DomainName.Buffer = 0;
-	SecurityBlob->DomainName.Length = 0;
-	SecurityBlob->DomainName.MaximumLength = 0;
-	if (ses->capabilities & CAP_UNICODE) {
-		if ((long) bcc_ptr % 2) {
-			*bcc_ptr = 0;
-			bcc_ptr++;
+		/* convert forward to back slashes in prepath here if needed */
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+			convert_delimiter(cifs_sb->prepath,
+					CIFS_DIR_SEP(cifs_sb));
+		full_path = build_unc_path_to_root(volume_info, cifs_sb);
+		if (IS_ERR(full_path)) {
+			rc = PTR_ERR(full_path);
+			goto mount_fail_check;
 		}
 
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				  32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
-				  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* null terminate Linux version */
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null terminate network opsys string */
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null domain */
-	} else {		/* ASCII */
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-		bcc_ptr++;	/* empty domain field */
-		*bcc_ptr = 0;
-	}
-	SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
-
-	if (smb_buffer_response->Status.CifsError ==
-	    cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
-		rc = 0;
-
-	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response); *//* done in SendReceive now */
-	} else if ((smb_buffer_response->WordCount == 3)
-		   || (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login"));
-	/* Do we want to set anything in SesInfo struct when guest login? */
-
-		bcc_ptr = pByteArea(smb_buffer_response);
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-
-		SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr;
-		if (SecurityBlob2->MessageType != NtLmChallenge) {
-			cFYI(1, ("Unexpected NTLMSSP message type received %d",
-				 SecurityBlob2->MessageType));
-		} else if (ses) {
-			ses->Suid = smb_buffer_response->Uid; /* UID left in le format */
-			cFYI(1, ("UID = %d", ses->Suid));
-			if ((pSMBr->resp.hdr.WordCount == 3)
-			    || ((pSMBr->resp.hdr.WordCount == 4)
-				&& (blob_len < pSMBr->resp.ByteCount))) {
-
-				if (pSMBr->resp.hdr.WordCount == 4) {
-					bcc_ptr += blob_len;
-					cFYI(1, ("Security Blob Length %d",
-						 blob_len));
-				}
-
-				cFYI(1, ("NTLMSSP Challenge rcvd"));
-
-				memcpy(ses->server->cryptKey,
-				       SecurityBlob2->Challenge,
-				       CIFS_CRYPTO_KEY_SIZE);
-				if (SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2))
-					*pNTLMv2_flag = true;
-
-				if ((SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_ALWAYS_SIGN))
-					|| (sign_CIFS_PDUs > 1))
-					ses->server->secMode |=
-						SECMODE_SIGN_REQUIRED;
-				if ((SecurityBlob2->NegotiateFlags &
-					cpu_to_le32(NTLMSSP_NEGOTIATE_SIGN)) && (sign_CIFS_PDUs))
-					ses->server->secMode |=
-						SECMODE_SIGN_ENABLED;
-
-				if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-					if ((long) (bcc_ptr) % 2) {
-						remaining_words =
-						    (BCC(smb_buffer_response) - 1) / 2;
-						/* Must word align unicode strings */
-						bcc_ptr++;
-					} else {
-						remaining_words =
-						    BCC(smb_buffer_response) / 2;
-					}
-					len = UniStrnlen((wchar_t *) bcc_ptr,
-							 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-					if (ses->serverOS)
-						kfree(ses->serverOS);
-					ses->serverOS =
-					    kzalloc(2 * (len + 1), GFP_KERNEL);
-					cifs_strfromUCS_le(ses->serverOS,
-							   (__le16 *) bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					remaining_words -= len + 1;
-					ses->serverOS[2 * len] = 0;
-					ses->serverOS[1 + (2 * len)] = 0;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr,
-								 remaining_words - 1);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2 * (len + 1),
-							    GFP_KERNEL);
-						cifs_strfromUCS_le(ses->serverNOS,
-								   (__le16 *) bcc_ptr,
-								   len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverNOS[2 * len] = 0;
-						ses->serverNOS[1 + (2 * len)] = 0;
-						remaining_words -= len + 1;
-						if (remaining_words > 0) {
-							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-		/* last string not always null terminated
-		   (for e.g. for Windows XP & 2000) */
-							kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2 * (len + 1),
-								    GFP_KERNEL);
-							cifs_strfromUCS_le
-							    (ses->serverDomain,
-							     (__le16 *)bcc_ptr,
-							     len, nls_codepage);
-							bcc_ptr += 2 * (len + 1);
-							ses->serverDomain[2*len] = 0;
-							ses->serverDomain[1 + (2 * len)] = 0;
-						} /* else no more room so create dummy domain string */
-						else {
-							kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2, GFP_KERNEL);
-						}
-					} else { /* no room so create dummy domain and NOS string */
-						kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(2, GFP_KERNEL);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2, GFP_KERNEL);
-					}
-				} else {	/* ASCII */
-					len = strnlen(bcc_ptr, 1024);
-					if (((long) bcc_ptr + len) - (long)
-					    pByteArea(smb_buffer_response)
-					    <= BCC(smb_buffer_response)) {
-						if (ses->serverOS)
-							kfree(ses->serverOS);
-						ses->serverOS =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverOS,
-							bcc_ptr, len);
-
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;	/* null terminate string */
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverNOS, bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverDomain);
-						ses->serverDomain =
-						    kzalloc(len + 1,
-							    GFP_KERNEL);
-						strncpy(ses->serverDomain,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-					} else
-						cFYI(1,
-						     ("field of length %d "
-						      "extends beyond end of smb",
-						      len));
-				}
-			} else {
-				cERROR(1, ("Security Blob Length extends beyond"
-					   " end of SMB"));
-			}
-		} else {
-			cERROR(1, ("No session structure passed in."));
+		cFYI(1, ("Getting referral for: %s", full_path));
+		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
+			cifs_sb->local_nls, &num_referrals, &referrals,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (!rc && num_referrals > 0) {
+			char *fake_devname = NULL;
+
+			if (mount_data != mount_data_global)
+				kfree(mount_data);
+			mount_data = cifs_compose_mount_options(
+					cifs_sb->mountdata, full_path + 1,
+					referrals, &fake_devname);
+			kfree(fake_devname);
+			free_dfs_info_array(referrals, num_referrals);
+
+			if (tcon)
+				cifs_put_tcon(tcon);
+			else if (pSesInfo)
+				cifs_put_smb_ses(pSesInfo);
+
+			cleanup_volume_info(&volume_info);
+			FreeXid(xid);
+			kfree(full_path);
+			referral_walks_count++;
+			goto try_mount_again;
 		}
-	} else {
-		cERROR(1, ("Invalid Word count %d:",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
-	}
-
-	cifs_buf_release(smb_buffer);
-
-	return rc;
-}
-static int
-CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			 char *ntlm_session_key, bool ntlmv2_flag,
-			 const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *user;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	int SecurityBlobLength = sizeof(AUTHENTICATE_MESSAGE);
-	PAUTHENTICATE_MESSAGE SecurityBlob;
-	__u32 negotiate_flags, capabilities;
-	__u16 count;
-
-	cFYI(1, ("In NTLMSSPSessSetup (Authenticate)"));
-	if (ses == NULL)
-		return -EINVAL;
-	user = ses->userName;
-	domain = ses->domainName;
-	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
-		return -ENOMEM;
-	}
-	smb_buffer_response = smb_buffer;
-	pSMB = (SESSION_SETUP_ANDX *)smb_buffer;
-	pSMBr = (SESSION_SETUP_ANDX *)smb_buffer_response;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 12 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req.hdr.Flags |= (SMBFLG_CASELESS | SMBFLG_CANONICAL_PATH_FORMAT);
-	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	pSMB->req.hdr.Uid = ses->Suid;
-
-	if (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-	    CAP_EXTENDED_SECURITY;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
+#else /* No DFS support, return error on mount */
+		rc = -EOPNOTSUPP;
+#endif
 	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req.Capabilities = cpu_to_le32(capabilities);
-
-	bcc_ptr = (char *)&pSMB->req.SecurityBlob;
-	SecurityBlob = (PAUTHENTICATE_MESSAGE)bcc_ptr;
-	strncpy(SecurityBlob->Signature, NTLMSSP_SIGNATURE, 8);
-	SecurityBlob->MessageType = NtLmAuthenticate;
-	bcc_ptr += SecurityBlobLength;
-	negotiate_flags = NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_REQUEST_TARGET |
-	    NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_TARGET_INFO |
-	    0x80000000 | NTLMSSP_NEGOTIATE_128;
-	if (sign_CIFS_PDUs)
-		negotiate_flags |= /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN |*/ NTLMSSP_NEGOTIATE_SIGN;
-	if (ntlmv2_flag)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;
-
-/* setup pointers to domain name and workstation name */
-
-	SecurityBlob->WorkstationName.Buffer = 0;
-	SecurityBlob->WorkstationName.Length = 0;
-	SecurityBlob->WorkstationName.MaximumLength = 0;
-	SecurityBlob->SessionKey.Length = 0;
-	SecurityBlob->SessionKey.MaximumLength = 0;
-	SecurityBlob->SessionKey.Buffer = 0;
-
-	SecurityBlob->LmChallengeResponse.Length = 0;
-	SecurityBlob->LmChallengeResponse.MaximumLength = 0;
-	SecurityBlob->LmChallengeResponse.Buffer = 0;
-
-	SecurityBlob->NtChallengeResponse.Length =
-	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	SecurityBlob->NtChallengeResponse.MaximumLength =
-	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE);
-	SecurityBlob->NtChallengeResponse.Buffer =
-	    cpu_to_le32(SecurityBlobLength);
-	SecurityBlobLength += CIFS_SESS_KEY_SIZE;
-	bcc_ptr += CIFS_SESS_KEY_SIZE;
 
-	if (ses->capabilities & CAP_UNICODE) {
-		if (domain == NULL) {
-			SecurityBlob->DomainName.Buffer = 0;
-			SecurityBlob->DomainName.Length = 0;
-			SecurityBlob->DomainName.MaximumLength = 0;
-		} else {
-			__u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-						 nls_codepage);
-			ln *= 2;
-			SecurityBlob->DomainName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->DomainName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->DomainName.Length = cpu_to_le16(ln);
-		}
-		if (user == NULL) {
-			SecurityBlob->UserName.Buffer = 0;
-			SecurityBlob->UserName.Length = 0;
-			SecurityBlob->UserName.MaximumLength = 0;
-		} else {
-			__u16 ln = cifs_strtoUCS((__le16 *) bcc_ptr, user, 64,
-						 nls_codepage);
-			ln *= 2;
-			SecurityBlob->UserName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->UserName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->UserName.Length = cpu_to_le16(ln);
-		}
-
-	/*	SecurityBlob->WorkstationName.Length =
-		 cifs_strtoUCS((__le16 *) bcc_ptr, "AMACHINE",64, nls_codepage);
-		SecurityBlob->WorkstationName.Length *= 2;
-		SecurityBlob->WorkstationName.MaximumLength =
-		    cpu_to_le16(SecurityBlob->WorkstationName.Length);
-		SecurityBlob->WorkstationName.Buffer =
-		    cpu_to_le32(SecurityBlobLength);
-		bcc_ptr += SecurityBlob->WorkstationName.Length;
-		SecurityBlobLength += SecurityBlob->WorkstationName.Length;
-		SecurityBlob->WorkstationName.Length =
-		    cpu_to_le16(SecurityBlob->WorkstationName.Length);  */
-
-		if ((long) bcc_ptr % 2) {
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				  32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
-				  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;	/* null term version string */
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null terminate network opsys string */
-		*(bcc_ptr + 1) = 0;
-		*(bcc_ptr + 2) = 0;
-		bcc_ptr += 2;	/* null domain */
-	} else {		/* ASCII */
-		if (domain == NULL) {
-			SecurityBlob->DomainName.Buffer = 0;
-			SecurityBlob->DomainName.Length = 0;
-			SecurityBlob->DomainName.MaximumLength = 0;
-		} else {
-			__u16 ln;
-			negotiate_flags |= NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED;
-			strncpy(bcc_ptr, domain, 63);
-			ln = strnlen(domain, 64);
-			SecurityBlob->DomainName.MaximumLength =
-			    cpu_to_le16(ln);
-			SecurityBlob->DomainName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->DomainName.Length = cpu_to_le16(ln);
-		}
-		if (user == NULL) {
-			SecurityBlob->UserName.Buffer = 0;
-			SecurityBlob->UserName.Length = 0;
-			SecurityBlob->UserName.MaximumLength = 0;
-		} else {
-			__u16 ln;
-			strncpy(bcc_ptr, user, 63);
-			ln = strnlen(user, 64);
-			SecurityBlob->UserName.MaximumLength = cpu_to_le16(ln);
-			SecurityBlob->UserName.Buffer =
-			    cpu_to_le32(SecurityBlobLength);
-			bcc_ptr += ln;
-			SecurityBlobLength += ln;
-			SecurityBlob->UserName.Length = cpu_to_le16(ln);
-		}
-		/* BB fill in our workstation name if known BB */
-
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, utsname()->release);
-		bcc_ptr += strlen(utsname()->release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-		bcc_ptr++;	/* null domain */
-		*bcc_ptr = 0;
-	}
-	SecurityBlob->NegotiateFlags = cpu_to_le32(negotiate_flags);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, CIFS_LONG_OP);
+mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
-/* rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */
-	} else if ((smb_buffer_response->WordCount == 3) ||
-		   (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-		if (action & GUEST_LOGIN)
-			cFYI(1, ("Guest login")); /* BB Should we set anything
-						     in SesInfo struct ? */
-/*		if (SecurityBlob2->MessageType != NtLm??) {
-			cFYI("Unexpected message type on auth response is %d"));
-		} */
-
-		if (ses) {
-			cFYI(1,
-			     ("Check challenge UID %d vs auth response UID %d",
-			      ses->Suid, smb_buffer_response->Uid));
-			/* UID left in wire format */
-			ses->Suid = smb_buffer_response->Uid;
-			bcc_ptr = pByteArea(smb_buffer_response);
-		/* response can have either 3 or 4 word count - Samba sends 3 */
-			if ((pSMBr->resp.hdr.WordCount == 3)
-			    || ((pSMBr->resp.hdr.WordCount == 4)
-				&& (blob_len < pSMBr->resp.ByteCount))) {
-				if (pSMBr->resp.hdr.WordCount == 4) {
-					bcc_ptr += blob_len;
-					cFYI(1, ("Security Blob Length %d ",
-						 blob_len));
-				}
-
-				cFYI(1, ("NTLMSSP response to Authenticate "));
-
-				if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-					if ((long) (bcc_ptr) % 2) {
-						remaining_words =
-						    (BCC(smb_buffer_response) - 1) / 2;
-						bcc_ptr++;	/* Unicode strings must be word aligned */
-					} else {
-						remaining_words = BCC(smb_buffer_response) / 2;
-					}
-					len = UniStrnlen((wchar_t *) bcc_ptr,
-							 remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response */
-					if (ses->serverOS)
-						kfree(ses->serverOS);
-					ses->serverOS =
-					    kzalloc(2 * (len + 1), GFP_KERNEL);
-					cifs_strfromUCS_le(ses->serverOS,
-							   (__le16 *) bcc_ptr,
-							   len, nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					remaining_words -= len + 1;
-					ses->serverOS[2 * len] = 0;
-					ses->serverOS[1 + (2 * len)] = 0;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *) bcc_ptr,
-								 remaining_words - 1);
-						kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2 * (len + 1),
-							    GFP_KERNEL);
-						cifs_strfromUCS_le(ses->serverNOS,
-								   (__le16 *) bcc_ptr,
-								   len, nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverNOS[2 * len] = 0;
-						ses->serverNOS[1+(2*len)] = 0;
-						remaining_words -= len + 1;
-						if (remaining_words > 0) {
-							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
-	/* last string not always null terminated (e.g. for Windows XP & 2000) */
-							if (ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2 * (len + 1),
-								    GFP_KERNEL);
-							cifs_strfromUCS_le
-							    (ses->serverDomain,
-							     (__le16 *) bcc_ptr,
-							     len, nls_codepage);
-							bcc_ptr += 2 * (len + 1);
-							ses->serverDomain[2 * len] = 0;
-							ses->serverDomain[1 + (2 * len)] = 0;
-						} /* else no more room so create dummy domain string */
-						else {
-							if (ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						}
-					} else { /* no room so create dummy domain and NOS string */
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(2, GFP_KERNEL);
-					}
-				} else {	/* ASCII */
-					len = strnlen(bcc_ptr, 1024);
-					if (((long) bcc_ptr + len) -
-					    (long) pByteArea(smb_buffer_response)
-					    <= BCC(smb_buffer_response)) {
-						if (ses->serverOS)
-							kfree(ses->serverOS);
-						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
-						strncpy(ses->serverOS,bcc_ptr, len);
-
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;	/* null terminate the string */
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(len+1,
-									 GFP_KERNEL);
-						strncpy(ses->serverNOS,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						if (ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain =
-							kzalloc(len+1,
-								GFP_KERNEL);
-						strncpy(ses->serverDomain,
-							bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-					} else
-						cFYI(1, ("field of length %d "
-							 "extends beyond end of smb ",
-							 len));
-				}
-			} else {
-				cERROR(1, ("Security Blob extends beyond end "
-					   "of SMB"));
-			}
-		} else {
-			cERROR(1, ("No session structure passed in."));
-		}
-	} else {
-		cERROR(1, ("Invalid Word count %d: ",
-			   smb_buffer_response->WordCount));
-		rc = -EIO;
+		if (mount_data != mount_data_global)
+			kfree(mount_data);
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+		else
+			cifs_put_tcp_session(srvTcp);
+		goto out;
 	}
 
-	cifs_buf_release(smb_buffer);
-
+	/* volume_info->password is freed above when existing session found
+	(in which case it is not needed anymore) but when new sesion is created
+	the password ptr is put in the new session structure (in which case the
+	password will be freed at unmount time) */
+out:
+	/* zero out password before freeing */
+	cleanup_volume_info(&volume_info);
+	FreeXid(xid);
 	return rc;
 }
 
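Editor's note: taken together, the hunk above turns cifs_mount() into a bounded retry loop: on -EREMOTE it fetches a DFS referral, rebuilds the mount options, tears down the half-built tcon/session, and jumps back to try_mount_again, giving up after MAX_NESTED_LINKS walks. A compact, self-contained model of that control shape follows; everything with a model_ prefix is illustrative, not the kernel code:

	#include <errno.h>

	#ifndef MAX_NESTED_LINKS
	#define MAX_NESTED_LINKS 8	/* fallback for the sketch only */
	#endif

	/* one mount attempt per iteration; the caller is assumed to
	 * re-point ctx at the referral target before the next pass */
	static int model_mount_with_referrals(int (*try_once)(void *),
					      void *ctx)
	{
		int rc, walks = 0;

		for (;;) {
			rc = try_once(ctx);
			if (rc != -EREMOTE)
				return rc;	/* success or hard error */
			if (++walks > MAX_NESTED_LINKS)
				return -ELOOP;	/* crude DFS loop guard */
		}
	}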
@@ -3556,7 +2623,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	TCONX_RSP *pSMBr;
 	unsigned char *bcc_ptr;
 	int rc = 0;
-	int length;
+	int length, bytes_left;
 	__u16 count;
 
 	if (ses == NULL)
@@ -3644,14 +2711,22 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
 			 CIFS_STD_OP);
 
-	/* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
+		bool is_unicode;
+
 		tcon->tidStatus = CifsGood;
 		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
-		length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
+		bytes_left = BCC(smb_buffer_response);
+		length = strnlen(bcc_ptr, bytes_left - 2);
+		if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
+			is_unicode = true;
+		else
+			is_unicode = false;
+
+
 		/* skip service field (NB: this field is always ASCII) */
 		if (length == 3) {
 			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
@@ -3666,40 +2741,16 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			}
 		}
 		bcc_ptr += length + 1;
+		bytes_left -= (length + 1);
 		strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
-		if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-			length = UniStrnlen((wchar_t *) bcc_ptr, 512);
-			if ((bcc_ptr + (2 * length)) -
-			     pByteArea(smb_buffer_response) <=
-			    BCC(smb_buffer_response)) {
-				kfree(tcon->nativeFileSystem);
-				tcon->nativeFileSystem =
-				    kzalloc(2*(length + 1), GFP_KERNEL);
-				if (tcon->nativeFileSystem)
-					cifs_strfromUCS_le(
-						tcon->nativeFileSystem,
-						(__le16 *) bcc_ptr,
-						length, nls_codepage);
-				bcc_ptr += 2 * length;
-				bcc_ptr[0] = 0;	/* null terminate the string */
-				bcc_ptr[1] = 0;
-				bcc_ptr += 2;
-			}
-			/* else do not bother copying these information fields*/
-		} else {
-			length = strnlen(bcc_ptr, 1024);
-			if ((bcc_ptr + length) -
-			    pByteArea(smb_buffer_response) <=
-			    BCC(smb_buffer_response)) {
-				kfree(tcon->nativeFileSystem);
-				tcon->nativeFileSystem =
-				    kzalloc(length + 1, GFP_KERNEL);
-				if (tcon->nativeFileSystem)
-					strncpy(tcon->nativeFileSystem, bcc_ptr,
-						length);
-			}
-			/* else do not bother copying these information fields*/
-		}
+
+		/* mostly informational -- no need to fail on error here */
+		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+						      bytes_left, is_unicode,
+						      nls_codepage);
+
+		cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem));
+
 		if ((smb_buffer_response->WordCount == 3) ||
 		     (smb_buffer_response->WordCount == 7))
 			/* field is in same location */
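Editor's note: the rewritten CIFSTCon() tail replaces hand-rolled bounds arithmetic with a running bytes_left count; each consumed field shrinks the budget, and the remainder goes straight to the duplication helper. A hedged userspace sketch of the same bookkeeping on a plain buffer (names are illustrative):

	#include <string.h>

	/* consume a NUL-terminated ASCII field, then treat whatever
	 * remains as the next field; mirrors the "- 2" headroom guard */
	static const char *consume_ascii_field(const char *p, int *bytes_left)
	{
		size_t len = strnlen(p, (size_t)(*bytes_left - 2));

		p += len + 1;			/* skip field and its NUL */
		*bytes_left -= (int)(len + 1);	/* keep the budget honest */
		return p;
	}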
@@ -3738,8 +2789,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			  struct nls_table *nls_info)
 {
 	int rc = 0;
-	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
-	bool ntlmv2_flag = false;
 	int first_time = 0;
 	struct TCP_Server_Info *server = pSesInfo->server;
 
@@ -3771,83 +2820,19 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 	pSesInfo->capabilities = server->capabilities;
 	if (linuxExtEnabled == 0)
 		pSesInfo->capabilities &= (~CAP_UNIX);
-	/* pSesInfo->sequence_number = 0;*/
+
 	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 		 server->secMode, server->capabilities, server->timeAdj));
 
-	if (experimEnabled < 2)
-		rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
-	else if (extended_security
-			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
-			&& (server->secType == NTLMSSP)) {
-		rc = -EOPNOTSUPP;
-	} else if (extended_security
-			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
-			&& (server->secType == RawNTLMSSP)) {
-		cFYI(1, ("NTLMSSP sesssetup"));
-		rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
-						   nls_info);
-		if (!rc) {
-			if (ntlmv2_flag) {
-				char *v2_response;
-				cFYI(1, ("more secure NTLM ver2 hash"));
-				if (CalcNTLMv2_partial_mac_key(pSesInfo,
-							       nls_info)) {
-					rc = -ENOMEM;
-					goto ss_err_exit;
-				} else
-					v2_response = kmalloc(16 + 64 /* blob*/,
-							      GFP_KERNEL);
-				if (v2_response) {
-					CalcNTLMv2_response(pSesInfo,
-							    v2_response);
-				/*	if (first_time)
-						cifs_calculate_ntlmv2_mac_key */
-					kfree(v2_response);
-					/* BB Put dummy sig in SessSetup PDU? */
-				} else {
-					rc = -ENOMEM;
-					goto ss_err_exit;
-				}
-
-			} else {
-				SMBNTencrypt(pSesInfo->password,
-					     server->cryptKey,
-					     ntlm_session_key);
-
-				if (first_time)
-					cifs_calculate_mac_key(
-						&server->mac_signing_key,
-						ntlm_session_key,
-						pSesInfo->password);
-			}
-			/* for better security the weaker lanman hash not sent
-			   in AuthSessSetup so we no longer calculate it */
-
-			rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo,
-						      ntlm_session_key,
-						      ntlmv2_flag,
-						      nls_info);
-		}
-	} else {	/* old style NTLM 0.12 session setup */
-		SMBNTencrypt(pSesInfo->password, server->cryptKey,
-			     ntlm_session_key);
-
-		if (first_time)
-			cifs_calculate_mac_key(&server->mac_signing_key,
-					       ntlm_session_key,
-					       pSesInfo->password);
-
-		rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
-	}
+	rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
 	if (rc) {
 		cERROR(1, ("Send error in SessSetup = %d", rc));
 	} else {
 		cFYI(1, ("CIFS Session Established successfully"));
 		spin_lock(&GlobalMid_Lock);
 		pSesInfo->status = CifsGood;
 		pSesInfo->need_reconnect = false;
 		spin_unlock(&GlobalMid_Lock);
 	}
 
 ss_err_exit:
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8d..3758965d73d5 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,12 +129,62 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+static void
+cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
+			struct cifsTconInfo *tcon, bool write_only)
+{
+	int oplock = 0;
+	struct cifsFileInfo *pCifsFile;
+	struct cifsInodeInfo *pCifsInode;
+
+	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+	if (pCifsFile == NULL)
+		return;
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+
+	pCifsFile->netfid = fileHandle;
+	pCifsFile->pid = current->tgid;
+	pCifsFile->pInode = newinode;
+	pCifsFile->invalidHandle = false;
+	pCifsFile->closePend = false;
+	mutex_init(&pCifsFile->fh_mutex);
+	mutex_init(&pCifsFile->lock_mutex);
+	INIT_LIST_HEAD(&pCifsFile->llist);
+	atomic_set(&pCifsFile->wrtPending, 0);
+
+	/* set the following in open now
+			pCifsFile->pfile = file; */
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
+	pCifsInode = CIFS_I(newinode);
+	if (pCifsInode) {
+		/* if readable file instance put first in list*/
+		if (write_only)
+			list_add_tail(&pCifsFile->flist,
+				      &pCifsInode->openFileList);
+		else
+			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+
+		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+			pCifsInode->clientCanCacheAll = true;
+			pCifsInode->clientCanCacheRead = true;
+			cFYI(1, ("Exclusive Oplock inode %p", newinode));
+		} else if ((oplock & 0xF) == OPLOCK_READ)
+			pCifsInode->clientCanCacheRead = true;
+	}
+	write_unlock(&GlobalSMBSeslock);
+}
+
 int cifs_posix_open(char *full_path, struct inode **pinode,
 		struct super_block *sb, int mode, int oflags,
 		int *poplock, __u16 *pnetfid, int xid)
 {
 	int rc;
 	__u32 oplock;
+	bool write_only = false;
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
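Editor's note on the list ordering inside cifs_fill_fileinfo(): write-only instances go to the tail so that readers scanning openFileList hit a read-capable handle first. A kernel-style sketch of a hypothetical consumer that benefits from that ordering (the helper name is invented for illustration; it is not a function this patch adds):

	/* hypothetical: with readable handles kept at the head, the
	 * first entry is usable for reads unless all opens are
	 * write-only */
	static struct cifsFileInfo *
	find_readable_file_sketch(struct cifsInodeInfo *ci)
	{
		if (list_empty(&ci->openFileList))
			return NULL;
		return list_first_entry(&ci->openFileList,
					struct cifsFileInfo, flist);
	}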
@@ -172,7 +222,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (oflags & O_DIRECT)
 		posix_flags |= SMB_O_DIRECT;
 
+	if (!(oflags & FMODE_READ))
+		write_only = true;
 
+	mode &= ~current_umask();
 	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
 			pnetfid, presp_data, &oplock, full_path,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -187,8 +240,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (!pinode)
 		goto posix_open_ret; /* caller does not need info */
 
-	if (*pinode == NULL)
-		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	if (*pinode == NULL) {
+		__u64 unique_id = le64_to_cpu(presp_data->UniqueId);
+		*pinode = cifs_new_inode(sb, &unique_id);
+	}
 	/* else an inode was passed in. Update its info, don't create one */
 
 	/* We do not need to close the file if new_inode fails since
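Editor's note: the UniqueId change above is an endianness fix. The field arrives little-endian on the wire, so handing its raw __le64 address to the inode code happens to work on little-endian hosts and silently breaks on big-endian ones. A sketch of the hazard (kernel-style fragment, not a new API):

	/* only the converted form is correct on all architectures */
	__u64 unique_id = le64_to_cpu(presp_data->UniqueId); /* host order */
	*pinode = cifs_new_inode(sb, &unique_id);            /* correct */
	/* cifs_new_inode(sb, &presp_data->UniqueId) would pass raw
	 * little-endian bytes and misnumber inodes on big-endian hosts */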
@@ -198,6 +253,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	posix_fill_in_inode(*pinode, presp_data, 1);
 
+	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+
 posix_open_ret:
 	kfree(presp_data);
 	return rc;
@@ -225,6 +282,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
 	int oflags;
+	bool posix_create = false;
 	/*
 	 * BB below access is probably too much for mknod to request
 	 * but we have to do query and setpathinfo so requesting
@@ -239,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
 	struct inode *newinode = NULL;
-	struct cifsInodeInfo *pCifsInode;
 	int disposition = FILE_OVERWRITE_IF;
 	bool write_only = false;
 
@@ -254,7 +311,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
@@ -273,12 +329,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	   negotation.  EREMOTE indicates DFS junction, which is not
 	   handled in posix open */
 
-	if ((rc == 0) && (newinode == NULL))
-		goto cifs_create_get_file_info; /* query inode info */
-	else if (rc == 0) /* success, no need to query */
-		goto cifs_create_set_dentry;
-	else if ((rc != -EIO) && (rc != -EREMOTE) &&
-		 (rc != -EOPNOTSUPP)) /* path not found or net err */
+	if (rc == 0) {
+		posix_create = true;
+		if (newinode == NULL) /* query inode info */
+			goto cifs_create_get_file_info;
+		else /* success, no need to query */
+			goto cifs_create_set_dentry;
+	} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+		 (rc != -EOPNOTSUPP) && (rc != -EINVAL))
 		goto cifs_create_out;
 	/* else fallthrough to retry, using older open call, this is
 	   case where server does not support this SMB level, and
@@ -409,45 +467,9 @@ cifs_create_set_dentry:
409 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 467 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
410 /* mknod case - do not leave file open */ 468 /* mknod case - do not leave file open */
411 CIFSSMBClose(xid, tcon, fileHandle); 469 CIFSSMBClose(xid, tcon, fileHandle);
412 } else if (newinode) { 470 } else if (!(posix_create) && (newinode)) {
413 struct cifsFileInfo *pCifsFile = 471 cifs_fill_fileinfo(newinode, fileHandle,
414 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 472 cifs_sb->tcon, write_only);
415
416 if (pCifsFile == NULL)
417 goto cifs_create_out;
418 pCifsFile->netfid = fileHandle;
419 pCifsFile->pid = current->tgid;
420 pCifsFile->pInode = newinode;
421 pCifsFile->invalidHandle = false;
422 pCifsFile->closePend = false;
423 init_MUTEX(&pCifsFile->fh_sem);
424 mutex_init(&pCifsFile->lock_mutex);
425 INIT_LIST_HEAD(&pCifsFile->llist);
426 atomic_set(&pCifsFile->wrtPending, 0);
427
428 /* set the following in open now
429 pCifsFile->pfile = file; */
430 write_lock(&GlobalSMBSeslock);
431 list_add(&pCifsFile->tlist, &tcon->openFileList);
432 pCifsInode = CIFS_I(newinode);
433 if (pCifsInode) {
434 /* if readable file instance put first in list*/
435 if (write_only) {
436 list_add_tail(&pCifsFile->flist,
437 &pCifsInode->openFileList);
438 } else {
439 list_add(&pCifsFile->flist,
440 &pCifsInode->openFileList);
441 }
442 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
443 pCifsInode->clientCanCacheAll = true;
444 pCifsInode->clientCanCacheRead = true;
445 cFYI(1, ("Exclusive Oplock inode %p",
446 newinode));
447 } else if ((oplock & 0xF) == OPLOCK_READ)
448 pCifsInode->clientCanCacheRead = true;
449 }
450 write_unlock(&GlobalSMBSeslock);
451 } 473 }
452cifs_create_out: 474cifs_create_out:
453 kfree(buf); 475 kfree(buf);
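
The block removed above is what the new cifs_fill_fileinfo() helper centralizes: allocate the handle, link it into the tcon and inode lists, and keep read-capable handles at the front of the inode list so later lookups for a readable handle need not walk past write-only entries. A standalone user-space sketch of that ordering rule (illustrative types, not the kernel structures):

    #include <stddef.h>
    #include <stdbool.h>

    struct open_handle {
        struct open_handle *next;
        bool write_only;
    };

    /* Readable handles go to the front, write-only ones to the tail. */
    static void insert_handle(struct open_handle **head, struct open_handle *h)
    {
        if (h->write_only) {
            struct open_handle **p = head;

            while (*p)
                p = &(*p)->next;    /* append at tail */
            h->next = NULL;
            *p = h;
        } else {
            h->next = *head;        /* push at front */
            *head = h;
        }
    }
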
@@ -479,7 +501,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
479 rc = -ENOMEM; 501 rc = -ENOMEM;
480 else if (pTcon->unix_ext) { 502 else if (pTcon->unix_ext) {
481 struct cifs_unix_set_info_args args = { 503 struct cifs_unix_set_info_args args = {
482 .mode = mode & ~current->fs->umask, 504 .mode = mode & ~current_umask(),
483 .ctime = NO_CHANGE_64, 505 .ctime = NO_CHANGE_64,
484 .atime = NO_CHANGE_64, 506 .atime = NO_CHANGE_64,
485 .mtime = NO_CHANGE_64, 507 .mtime = NO_CHANGE_64,
@@ -580,17 +602,20 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
580 return rc; 602 return rc;
581} 603}
582 604
583
584struct dentry * 605struct dentry *
585cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, 606cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
586 struct nameidata *nd) 607 struct nameidata *nd)
587{ 608{
588 int xid; 609 int xid;
589 int rc = 0; /* to get around spurious gcc warning, set to zero here */ 610 int rc = 0; /* to get around spurious gcc warning, set to zero here */
611 int oplock = 0;
612 __u16 fileHandle = 0;
613 bool posix_open = false;
590 struct cifs_sb_info *cifs_sb; 614 struct cifs_sb_info *cifs_sb;
591 struct cifsTconInfo *pTcon; 615 struct cifsTconInfo *pTcon;
592 struct inode *newInode = NULL; 616 struct inode *newInode = NULL;
593 char *full_path = NULL; 617 char *full_path = NULL;
618 struct file *filp;
594 619
595 xid = GetXid(); 620 xid = GetXid();
596 621
@@ -632,12 +657,43 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 } 657 }
633 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 658 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
634 659
635 if (pTcon->unix_ext) 660 /* Posix open is only called (at lookup time) for file create now.
636 rc = cifs_get_inode_info_unix(&newInode, full_path, 661 * For opens (rather than creates), because we do not know if it
637 parent_dir_inode->i_sb, xid); 662 * is a file or directory yet, and current Samba no longer allows
638 else 663 * us to do posix open on dirs, we could end up wasting an open call
664 * on what turns out to be a dir. For file opens, we wait to call posix
665 * open till cifs_open. It could be added here (lookup) in the future
666 * but the performance tradeoff of the extra network request when EISDIR
667 * or EACCES is returned would have to be weighed against the 50%
668 * reduction in network traffic in the other paths.
669 */
670 if (pTcon->unix_ext) {
671 if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
672 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
673 (nd->intent.open.flags & O_CREAT)) {
674 rc = cifs_posix_open(full_path, &newInode,
675 parent_dir_inode->i_sb,
676 nd->intent.open.create_mode,
677 nd->intent.open.flags, &oplock,
678 &fileHandle, xid);
679 /*
680 * The check below works around a bug in POSIX
681 * open in samba versions 3.3.1 and earlier where
682 * open could incorrectly fail with invalid parameter.
683 * If either that or op not supported returned, follow
684 * the normal lookup.
685 */
686 if ((rc == 0) || (rc == -ENOENT))
687 posix_open = true;
688 else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP))
689 pTcon->broken_posix_open = true;
690 }
691 if (!posix_open)
692 rc = cifs_get_inode_info_unix(&newInode, full_path,
693 parent_dir_inode->i_sb, xid);
694 } else
639 rc = cifs_get_inode_info(&newInode, full_path, NULL, 695 rc = cifs_get_inode_info(&newInode, full_path, NULL,
640 parent_dir_inode->i_sb, xid, NULL); 696 parent_dir_inode->i_sb, xid, NULL);
641 697
642 if ((rc == 0) && (newInode != NULL)) { 698 if ((rc == 0) && (newInode != NULL)) {
643 if (pTcon->nocase) 699 if (pTcon->nocase)
@@ -645,7 +701,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
645 else 701 else
646 direntry->d_op = &cifs_dentry_ops; 702 direntry->d_op = &cifs_dentry_ops;
647 d_add(direntry, newInode); 703 d_add(direntry, newInode);
648 704 if (posix_open)
705 filp = lookup_instantiate_filp(nd, direntry, NULL);
649 /* since paths are not looked up by component - the parent 706 /* since paths are not looked up by component - the parent
650 directories are presumed to be good here */ 707 directories are presumed to be good here */
651 renew_parental_timestamps(direntry); 708 renew_parental_timestamps(direntry);
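
Condensing the lookup-time policy from the comments above: posix open is attempted only for O_CREAT opens of non-directories on a unix-extensions mount, and a server that fails it with EINVAL is remembered as broken so later lookups skip the attempt. A simplified user-space sketch (illustrative names; the broken-server test is reduced to the Samba-bug case the comment describes):

    #include <stdbool.h>
    #include <errno.h>

    struct tcon {
        bool unix_ext;
        bool broken_posix_open;
    };

    /* Decide whether this lookup should try a posix open first. */
    static bool try_posix_open(const struct tcon *t, bool creating,
                               bool want_dir)
    {
        return t->unix_ext && creating && !want_dir &&
               !t->broken_posix_open;
    }

    /* After a failed attempt, remember servers with the old Samba bug. */
    static void note_posix_open_result(struct tcon *t, int rc)
    {
        if (rc == -EINVAL)
            t->broken_posix_open = true;
    }
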
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1e0c1bd8f2e4..df4a306f697e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
78 } 78 }
79 79
80 key->type_data.x[0] = datalen; 80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip); 81 key->payload.data = ip;
82 82
83 return rc; 83 return rc;
84} 84}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 81747acca4c4..302ea15f02e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private(
46 memset(private_data, 0, sizeof(struct cifsFileInfo)); 46 memset(private_data, 0, sizeof(struct cifsFileInfo));
47 private_data->netfid = netfid; 47 private_data->netfid = netfid;
48 private_data->pid = current->tgid; 48 private_data->pid = current->tgid;
49 init_MUTEX(&private_data->fh_sem); 49 mutex_init(&private_data->fh_mutex);
50 mutex_init(&private_data->lock_mutex); 50 mutex_init(&private_data->lock_mutex);
51 INIT_LIST_HEAD(&private_data->llist); 51 INIT_LIST_HEAD(&private_data->llist);
52 private_data->pfile = file; /* needed for writepage */ 52 private_data->pfile = file; /* needed for writepage */
@@ -129,15 +129,8 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
129 struct file *file, struct cifsInodeInfo *pCifsInode, 129 struct file *file, struct cifsInodeInfo *pCifsInode,
130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) 130 struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
131{ 131{
132 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
133/* struct timespec temp; */ /* BB REMOVEME BB */
134 132
135 file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
136 if (file->private_data == NULL)
137 return -ENOMEM;
138 pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
139 write_lock(&GlobalSMBSeslock); 133 write_lock(&GlobalSMBSeslock);
140 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
141 134
142 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 135 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
143 if (pCifsInode == NULL) { 136 if (pCifsInode == NULL) {
@@ -145,17 +138,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode,
145 return -EINVAL; 138 return -EINVAL;
146 } 139 }
147 140
148 /* want handles we can use to read with first
149 in the list so we do not have to walk the
150 list to search for one in write_begin */
151 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
152 list_add_tail(&pCifsFile->flist,
153 &pCifsInode->openFileList);
154 } else {
155 list_add(&pCifsFile->flist,
156 &pCifsInode->openFileList);
157 }
158
159 if (pCifsInode->clientCanCacheRead) { 141 if (pCifsInode->clientCanCacheRead) {
160 /* we have the inode open somewhere else 142 /* we have the inode open somewhere else
161 no need to discard cache data */ 143 no need to discard cache data */
@@ -198,6 +180,38 @@ psx_client_can_cache:
198 return 0; 180 return 0;
199} 181}
200 182
183static struct cifsFileInfo *
184cifs_fill_filedata(struct file *file)
185{
186 struct list_head *tmp;
187 struct cifsFileInfo *pCifsFile = NULL;
188 struct cifsInodeInfo *pCifsInode = NULL;
189
190 /* search inode for this file and fill in file->private_data */
191 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
192 read_lock(&GlobalSMBSeslock);
193 list_for_each(tmp, &pCifsInode->openFileList) {
194 pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
195 if ((pCifsFile->pfile == NULL) &&
196 (pCifsFile->pid == current->tgid)) {
197 /* mode set in cifs_create */
198
199 /* needed for writepage */
200 pCifsFile->pfile = file;
201 file->private_data = pCifsFile;
202 break;
203 }
204 }
205 read_unlock(&GlobalSMBSeslock);
206
207 if (file->private_data != NULL) {
208 return pCifsFile;
209 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
210 cERROR(1, ("could not find file instance for "
211 "new file %p", file));
212 return NULL;
213}
214
201/* all arguments to this function must be checked for validity in caller */ 215/* all arguments to this function must be checked for validity in caller */
202static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, 216static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
203 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, 217 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
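
The new cifs_fill_filedata() above walks the inode's open-handle list and claims the first entry that was opened by this process but has no struct file attached yet. A user-space sketch of the same search over a simple singly linked list (illustrative layout):

    #include <stddef.h>

    /* One cached open handle on an inode's list. */
    struct handle {
        struct handle *next;
        int pid;
        void *pfile;        /* NULL until a struct file claims it */
    };

    /* Claim the first unclaimed handle opened by this process. */
    static struct handle *claim_handle(struct handle *head, int my_pid,
                                       void *file)
    {
        struct handle *h;

        for (h = head; h != NULL; h = h->next) {
            if (h->pfile == NULL && h->pid == my_pid) {
                h->pfile = file;    /* needed later for writepage */
                return h;
            }
        }
        return NULL;
    }
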
@@ -272,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file)
272 struct cifsTconInfo *tcon; 286 struct cifsTconInfo *tcon;
273 struct cifsFileInfo *pCifsFile; 287 struct cifsFileInfo *pCifsFile;
274 struct cifsInodeInfo *pCifsInode; 288 struct cifsInodeInfo *pCifsInode;
275 struct list_head *tmp;
276 char *full_path = NULL; 289 char *full_path = NULL;
277 int desiredAccess; 290 int desiredAccess;
278 int disposition; 291 int disposition;
@@ -284,34 +297,11 @@ int cifs_open(struct inode *inode, struct file *file)
284 cifs_sb = CIFS_SB(inode->i_sb); 297 cifs_sb = CIFS_SB(inode->i_sb);
285 tcon = cifs_sb->tcon; 298 tcon = cifs_sb->tcon;
286 299
287 if (file->f_flags & O_CREAT) { 300 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
288 /* search inode for this file and fill in file->private_data */ 301 pCifsFile = cifs_fill_filedata(file);
289 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 302 if (pCifsFile) {
290 read_lock(&GlobalSMBSeslock); 303 FreeXid(xid);
291 list_for_each(tmp, &pCifsInode->openFileList) { 304 return 0;
292 pCifsFile = list_entry(tmp, struct cifsFileInfo,
293 flist);
294 if ((pCifsFile->pfile == NULL) &&
295 (pCifsFile->pid == current->tgid)) {
296 /* mode set in cifs_create */
297
298 /* needed for writepage */
299 pCifsFile->pfile = file;
300
301 file->private_data = pCifsFile;
302 break;
303 }
304 }
305 read_unlock(&GlobalSMBSeslock);
306 if (file->private_data != NULL) {
307 rc = 0;
308 FreeXid(xid);
309 return rc;
310 } else {
311 if (file->f_flags & O_EXCL)
312 cERROR(1, ("could not find file instance for "
313 "new file %p", file));
314 }
315 } 305 }
316 306
317 full_path = build_path_from_dentry(file->f_path.dentry); 307 full_path = build_path_from_dentry(file->f_path.dentry);
@@ -342,6 +332,7 @@ int cifs_open(struct inode *inode, struct file *file)
342 /* no need for special case handling of setting mode 332 /* no need for special case handling of setting mode
343 on read only files needed here */ 333 on read only files needed here */
344 334
335 pCifsFile = cifs_fill_filedata(file);
345 cifs_posix_open_inode_helper(inode, file, pCifsInode, 336 cifs_posix_open_inode_helper(inode, file, pCifsInode,
346 pCifsFile, oplock, netfid); 337 pCifsFile, oplock, netfid);
347 goto out; 338 goto out;
@@ -500,9 +491,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
500 return -EBADF; 491 return -EBADF;
501 492
502 xid = GetXid(); 493 xid = GetXid();
 503 down(&pCifsFile->fh_sem); 494 mutex_lock(&pCifsFile->fh_mutex);
504 if (!pCifsFile->invalidHandle) { 495 if (!pCifsFile->invalidHandle) {
 505 up(&pCifsFile->fh_sem); 496 mutex_unlock(&pCifsFile->fh_mutex);
506 FreeXid(xid); 497 FreeXid(xid);
507 return 0; 498 return 0;
508 } 499 }
@@ -533,7 +524,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
533 if (full_path == NULL) { 524 if (full_path == NULL) {
534 rc = -ENOMEM; 525 rc = -ENOMEM;
535reopen_error_exit: 526reopen_error_exit:
 536 up(&pCifsFile->fh_sem); 527 mutex_unlock(&pCifsFile->fh_mutex);
537 FreeXid(xid); 528 FreeXid(xid);
538 return rc; 529 return rc;
539 } 530 }
@@ -575,14 +566,14 @@ reopen_error_exit:
575 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 566 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
576 CIFS_MOUNT_MAP_SPECIAL_CHR); 567 CIFS_MOUNT_MAP_SPECIAL_CHR);
577 if (rc) { 568 if (rc) {
 578 up(&pCifsFile->fh_sem); 569 mutex_unlock(&pCifsFile->fh_mutex);
579 cFYI(1, ("cifs_open returned 0x%x", rc)); 570 cFYI(1, ("cifs_open returned 0x%x", rc));
580 cFYI(1, ("oplock: %d", oplock)); 571 cFYI(1, ("oplock: %d", oplock));
581 } else { 572 } else {
582reopen_success: 573reopen_success:
583 pCifsFile->netfid = netfid; 574 pCifsFile->netfid = netfid;
584 pCifsFile->invalidHandle = false; 575 pCifsFile->invalidHandle = false;
 585 up(&pCifsFile->fh_sem); 576 mutex_unlock(&pCifsFile->fh_mutex);
586 pCifsInode = CIFS_I(inode); 577 pCifsInode = CIFS_I(inode);
587 if (pCifsInode) { 578 if (pCifsInode) {
588 if (can_flush) { 579 if (can_flush) {
@@ -971,6 +962,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
971 return rc; 962 return rc;
972} 963}
973 964
965/*
966 * Set the timeout on write requests past EOF. For some servers (Windows)
967 * these calls can be very long.
968 *
969 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
970 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
971 * The 10M cutoff is totally arbitrary. A better scheme for this would be
972 * welcome if someone wants to suggest one.
973 *
974 * We may be able to do a better job with this if there were some way to
975 * declare that a file should be sparse.
976 */
977static int
978cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
979{
980 if (offset <= cifsi->server_eof)
981 return CIFS_STD_OP;
982 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
983 return CIFS_VLONG_OP;
984 else
985 return CIFS_LONG_OP;
986}
987
988/* update the file size (if needed) after a write */
989static void
990cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
991 unsigned int bytes_written)
992{
993 loff_t end_of_write = offset + bytes_written;
994
995 if (end_of_write > cifsi->server_eof)
996 cifsi->server_eof = end_of_write;
997}
998
974ssize_t cifs_user_write(struct file *file, const char __user *write_data, 999ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 size_t write_size, loff_t *poffset) 1000 size_t write_size, loff_t *poffset)
976{ 1001{
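
A worked example of the bucketing implemented by cifs_write_timeout() above, as a user-space sketch with illustrative second values standing in for the CIFS_STD_OP/CIFS_LONG_OP/CIFS_VLONG_OP constants:

    #include <stdint.h>

    enum {
        TIMEOUT_STD   = 15,     /* writes not past EOF */
        TIMEOUT_LONG  = 45,     /* up to 10M past EOF */
        TIMEOUT_VLONG = 180     /* more than 10M past EOF */
    };

    static int write_timeout_secs(uint64_t server_eof, uint64_t offset)
    {
        if (offset <= server_eof)
            return TIMEOUT_STD;
        if (offset > server_eof + 10ULL * 1024 * 1024)
            return TIMEOUT_VLONG;
        return TIMEOUT_LONG;
    }
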
@@ -981,6 +1006,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
981 struct cifsTconInfo *pTcon; 1006 struct cifsTconInfo *pTcon;
982 int xid, long_op; 1007 int xid, long_op;
983 struct cifsFileInfo *open_file; 1008 struct cifsFileInfo *open_file;
1009 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
984 1010
985 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1011 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
986 1012
@@ -1000,11 +1026,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1000 1026
1001 xid = GetXid(); 1027 xid = GetXid();
1002 1028
1003 if (*poffset > file->f_path.dentry->d_inode->i_size) 1029 long_op = cifs_write_timeout(cifsi, *poffset);
1004 long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
1005 else
1006 long_op = CIFS_LONG_OP;
1007
1008 for (total_written = 0; write_size > total_written; 1030 for (total_written = 0; write_size > total_written;
1009 total_written += bytes_written) { 1031 total_written += bytes_written) {
1010 rc = -EAGAIN; 1032 rc = -EAGAIN;
@@ -1048,8 +1070,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1048 FreeXid(xid); 1070 FreeXid(xid);
1049 return rc; 1071 return rc;
1050 } 1072 }
1051 } else 1073 } else {
1074 cifs_update_eof(cifsi, *poffset, bytes_written);
1052 *poffset += bytes_written; 1075 *poffset += bytes_written;
1076 }
1053 long_op = CIFS_STD_OP; /* subsequent writes fast - 1077 long_op = CIFS_STD_OP; /* subsequent writes fast -
1054 15 seconds is plenty */ 1078 15 seconds is plenty */
1055 } 1079 }
@@ -1085,6 +1109,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1085 struct cifsTconInfo *pTcon; 1109 struct cifsTconInfo *pTcon;
1086 int xid, long_op; 1110 int xid, long_op;
1087 struct cifsFileInfo *open_file; 1111 struct cifsFileInfo *open_file;
1112 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
1088 1113
1089 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1114 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1090 1115
@@ -1099,11 +1124,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1099 1124
1100 xid = GetXid(); 1125 xid = GetXid();
1101 1126
1102 if (*poffset > file->f_path.dentry->d_inode->i_size) 1127 long_op = cifs_write_timeout(cifsi, *poffset);
1103 long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
1104 else
1105 long_op = CIFS_LONG_OP;
1106
1107 for (total_written = 0; write_size > total_written; 1128 for (total_written = 0; write_size > total_written;
1108 total_written += bytes_written) { 1129 total_written += bytes_written) {
1109 rc = -EAGAIN; 1130 rc = -EAGAIN;
@@ -1166,8 +1187,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1166 FreeXid(xid); 1187 FreeXid(xid);
1167 return rc; 1188 return rc;
1168 } 1189 }
1169 } else 1190 } else {
1191 cifs_update_eof(cifsi, *poffset, bytes_written);
1170 *poffset += bytes_written; 1192 *poffset += bytes_written;
1193 }
1171 long_op = CIFS_STD_OP; /* subsequent writes fast - 1194 long_op = CIFS_STD_OP; /* subsequent writes fast -
1172 15 seconds is plenty */ 1195 15 seconds is plenty */
1173 } 1196 }
@@ -1380,11 +1403,12 @@ static int cifs_writepages(struct address_space *mapping,
1380 int nr_pages; 1403 int nr_pages;
1381 __u64 offset = 0; 1404 __u64 offset = 0;
1382 struct cifsFileInfo *open_file; 1405 struct cifsFileInfo *open_file;
1406 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1383 struct page *page; 1407 struct page *page;
1384 struct pagevec pvec; 1408 struct pagevec pvec;
1385 int rc = 0; 1409 int rc = 0;
1386 int scanned = 0; 1410 int scanned = 0;
1387 int xid; 1411 int xid, long_op;
1388 1412
1389 cifs_sb = CIFS_SB(mapping->host->i_sb); 1413 cifs_sb = CIFS_SB(mapping->host->i_sb);
1390 1414
@@ -1528,12 +1552,15 @@ retry:
1528 cERROR(1, ("No writable handles for inode")); 1552 cERROR(1, ("No writable handles for inode"));
1529 rc = -EBADF; 1553 rc = -EBADF;
1530 } else { 1554 } else {
1555 long_op = cifs_write_timeout(cifsi, offset);
1531 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1556 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1532 open_file->netfid, 1557 open_file->netfid,
1533 bytes_to_write, offset, 1558 bytes_to_write, offset,
1534 &bytes_written, iov, n_iov, 1559 &bytes_written, iov, n_iov,
1535 CIFS_LONG_OP); 1560 long_op);
1536 atomic_dec(&open_file->wrtPending); 1561 atomic_dec(&open_file->wrtPending);
1562 cifs_update_eof(cifsi, offset, bytes_written);
1563
1537 if (rc || bytes_written < bytes_to_write) { 1564 if (rc || bytes_written < bytes_to_write) {
1538 cERROR(1, ("Write2 ret %d, wrote %d", 1565 cERROR(1, ("Write2 ret %d, wrote %d",
1539 rc, bytes_written)); 1566 rc, bytes_written));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc60805..9c869a6dcba1 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode,
143 143
144 inode->i_nlink = le64_to_cpu(info->Nlinks); 144 inode->i_nlink = le64_to_cpu(info->Nlinks);
145 145
146 cifsInfo->server_eof = end_of_file;
146 spin_lock(&inode->i_lock); 147 spin_lock(&inode->i_lock);
147 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 148 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
148 /* 149 /*
@@ -276,7 +277,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
276 277
277 /* get new inode */ 278 /* get new inode */
278 if (*pinode == NULL) { 279 if (*pinode == NULL) {
279 *pinode = cifs_new_inode(sb, &find_data.UniqueId); 280 __u64 unique_id = le64_to_cpu(find_data.UniqueId);
281 *pinode = cifs_new_inode(sb, &unique_id);
280 if (*pinode == NULL) { 282 if (*pinode == NULL) {
281 rc = -ENOMEM; 283 rc = -ENOMEM;
282 goto cgiiu_exit; 284 goto cgiiu_exit;
@@ -605,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode,
605 inode->i_mode |= S_IFREG; 607 inode->i_mode |= S_IFREG;
606 } 608 }
607 609
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile);
608 spin_lock(&inode->i_lock); 611 spin_lock(&inode->i_lock);
609 if (is_size_safe_to_change(cifsInfo, 612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) {
610 le64_to_cpu(pfindData->EndOfFile))) {
611 /* can not safely shrink the file size here if the 613 /* can not safely shrink the file size here if the
612 client is writing to it due to potential races */ 614 client is writing to it due to potential races */
613 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); 615 i_size_write(inode, cifsInfo->server_eof);
614 616
615 /* 512 bytes (2**9) is the fake blocksize that must be 617 /* 512 bytes (2**9) is the fake blocksize that must be
616 used for this calculation */ 618 used for this calculation */
@@ -960,13 +962,21 @@ undo_setattr:
960 goto out_close; 962 goto out_close;
961} 963}
962 964
965
966/*
967 * If dentry->d_inode is null (usually meaning the cached dentry
968 * is a negative dentry) then we would attempt a standard SMB delete, but
 969 * if that fails we cannot attempt the fallback mechanisms on EACCES
 970 * but will return the EACCES to the caller. Note that the VFS does not call
971 * unlink on negative dentries currently.
972 */
963int cifs_unlink(struct inode *dir, struct dentry *dentry) 973int cifs_unlink(struct inode *dir, struct dentry *dentry)
964{ 974{
965 int rc = 0; 975 int rc = 0;
966 int xid; 976 int xid;
967 char *full_path = NULL; 977 char *full_path = NULL;
968 struct inode *inode = dentry->d_inode; 978 struct inode *inode = dentry->d_inode;
969 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 979 struct cifsInodeInfo *cifs_inode;
970 struct super_block *sb = dir->i_sb; 980 struct super_block *sb = dir->i_sb;
971 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 981 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
972 struct cifsTconInfo *tcon = cifs_sb->tcon; 982 struct cifsTconInfo *tcon = cifs_sb->tcon;
@@ -1010,7 +1020,7 @@ psx_del_no_retry:
1010 rc = cifs_rename_pending_delete(full_path, dentry, xid); 1020 rc = cifs_rename_pending_delete(full_path, dentry, xid);
1011 if (rc == 0) 1021 if (rc == 0)
1012 drop_nlink(inode); 1022 drop_nlink(inode);
1013 } else if (rc == -EACCES && dosattr == 0) { 1023 } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
1014 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 1024 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
1015 if (attrs == NULL) { 1025 if (attrs == NULL) {
1016 rc = -ENOMEM; 1026 rc = -ENOMEM;
@@ -1018,7 +1028,8 @@ psx_del_no_retry:
1018 } 1028 }
1019 1029
1020 /* try to reset dos attributes */ 1030 /* try to reset dos attributes */
1021 origattr = cifsInode->cifsAttrs; 1031 cifs_inode = CIFS_I(inode);
1032 origattr = cifs_inode->cifsAttrs;
1022 if (origattr == 0) 1033 if (origattr == 0)
1023 origattr |= ATTR_NORMAL; 1034 origattr |= ATTR_NORMAL;
1024 dosattr = origattr & ~ATTR_READONLY; 1035 dosattr = origattr & ~ATTR_READONLY;
@@ -1039,13 +1050,13 @@ psx_del_no_retry:
1039 1050
1040out_reval: 1051out_reval:
1041 if (inode) { 1052 if (inode) {
1042 cifsInode = CIFS_I(inode); 1053 cifs_inode = CIFS_I(inode);
1043 cifsInode->time = 0; /* will force revalidate to get info 1054 cifs_inode->time = 0; /* will force revalidate to get info
1044 when needed */ 1055 when needed */
1045 inode->i_ctime = current_fs_time(sb); 1056 inode->i_ctime = current_fs_time(sb);
1046 } 1057 }
1047 dir->i_ctime = dir->i_mtime = current_fs_time(sb); 1058 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
1048 cifsInode = CIFS_I(dir); 1059 cifs_inode = CIFS_I(dir);
1049 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ 1060 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
1050 1061
1051 kfree(full_path); 1062 kfree(full_path);
@@ -1125,7 +1136,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1125 goto mkdir_out; 1136 goto mkdir_out;
1126 } 1137 }
1127 1138
1128 mode &= ~current->fs->umask; 1139 mode &= ~current_umask();
1129 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, 1140 rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
1130 mode, NULL /* netfid */, pInfo, &oplock, 1141 mode, NULL /* netfid */, pInfo, &oplock,
1131 full_path, cifs_sb->local_nls, 1142 full_path, cifs_sb->local_nls,
@@ -1138,6 +1149,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1138 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1149 cFYI(1, ("posix mkdir returned 0x%x", rc));
1139 d_drop(direntry); 1150 d_drop(direntry);
1140 } else { 1151 } else {
1152 __u64 unique_id;
1141 if (pInfo->Type == cpu_to_le32(-1)) { 1153 if (pInfo->Type == cpu_to_le32(-1)) {
1142 /* no return info, go query for it */ 1154 /* no return info, go query for it */
1143 kfree(pInfo); 1155 kfree(pInfo);
@@ -1151,8 +1163,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1151 else 1163 else
1152 direntry->d_op = &cifs_dentry_ops; 1164 direntry->d_op = &cifs_dentry_ops;
1153 1165
1154 newinode = cifs_new_inode(inode->i_sb, 1166 unique_id = le64_to_cpu(pInfo->UniqueId);
1155 &pInfo->UniqueId); 1167 newinode = cifs_new_inode(inode->i_sb, &unique_id);
1156 if (newinode == NULL) { 1168 if (newinode == NULL) {
1157 kfree(pInfo); 1169 kfree(pInfo);
1158 goto mkdir_get_info; 1170 goto mkdir_get_info;
@@ -1204,7 +1216,7 @@ mkdir_get_info:
1204 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) 1216 if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
1205 direntry->d_inode->i_nlink = 2; 1217 direntry->d_inode->i_nlink = 2;
1206 1218
1207 mode &= ~current->fs->umask; 1219 mode &= ~current_umask();
1208 /* must turn on setgid bit if parent dir has it */ 1220 /* must turn on setgid bit if parent dir has it */
1209 if (inode->i_mode & S_ISGID) 1221 if (inode->i_mode & S_ISGID)
1210 mode |= S_ISGID; 1222 mode |= S_ISGID;
@@ -1450,7 +1462,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1450 checking the UniqueId via FILE_INTERNAL_INFO */ 1462 checking the UniqueId via FILE_INTERNAL_INFO */
1451 1463
1452unlink_target: 1464unlink_target:
1453 if ((rc == -EACCES) || (rc == -EEXIST)) { 1465 /* Try unlinking the target dentry if it's not negative */
1466 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1454 tmprc = cifs_unlink(target_dir, target_dentry); 1467 tmprc = cifs_unlink(target_dir, target_dentry);
1455 if (tmprc) 1468 if (tmprc)
1456 goto cifs_rename_exit; 1469 goto cifs_rename_exit;
@@ -1753,6 +1766,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1753 } 1766 }
1754 1767
1755 if (rc == 0) { 1768 if (rc == 0) {
1769 cifsInode->server_eof = attrs->ia_size;
1756 rc = cifs_vmtruncate(inode, attrs->ia_size); 1770 rc = cifs_vmtruncate(inode, attrs->ia_size);
1757 cifs_truncate_page(inode->i_mapping, inode->i_size); 1771 cifs_truncate_page(inode->i_mapping, inode->i_size);
1758 } 1772 }
@@ -1792,20 +1806,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1792 goto out; 1806 goto out;
1793 } 1807 }
1794 1808
1795 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1809 /*
1796 /* 1810 * Attempt to flush data before changing attributes. We need to do
1797 Flush data before changing file size or changing the last 1811 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1798 write time of the file on the server. If the 1812 * ownership or mode then we may also need to do this. Here, we take
1799 flush returns error, store it to report later and continue. 1813 * the safe way out and just do the flush on all setattr requests. If
1800 BB: This should be smarter. Why bother flushing pages that 1814 * the flush returns error, store it to report later and continue.
1801 will be truncated anyway? Also, should we error out here if 1815 *
1802 the flush returns error? 1816 * BB: This should be smarter. Why bother flushing pages that
1803 */ 1817 * will be truncated anyway? Also, should we error out here if
1804 rc = filemap_write_and_wait(inode->i_mapping); 1818 * the flush returns error?
1805 if (rc != 0) { 1819 */
1806 cifsInode->write_behind_rc = rc; 1820 rc = filemap_write_and_wait(inode->i_mapping);
1807 rc = 0; 1821 if (rc != 0) {
1808 } 1822 cifsInode->write_behind_rc = rc;
1823 rc = 0;
1809 } 1824 }
1810 1825
1811 if (attrs->ia_valid & ATTR_SIZE) { 1826 if (attrs->ia_valid & ATTR_SIZE) {
@@ -1903,20 +1918,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1903 return -ENOMEM; 1918 return -ENOMEM;
1904 } 1919 }
1905 1920
1906 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1921 /*
1907 /* 1922 * Attempt to flush data before changing attributes. We need to do
1908 Flush data before changing file size or changing the last 1923 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1909 write time of the file on the server. If the 1924 * ownership or mode then we may also need to do this. Here, we take
1910 flush returns error, store it to report later and continue. 1925 * the safe way out and just do the flush on all setattr requests. If
1911 BB: This should be smarter. Why bother flushing pages that 1926 * the flush returns error, store it to report later and continue.
1912 will be truncated anyway? Also, should we error out here if 1927 *
1913 the flush returns error? 1928 * BB: This should be smarter. Why bother flushing pages that
1914 */ 1929 * will be truncated anyway? Also, should we error out here if
1915 rc = filemap_write_and_wait(inode->i_mapping); 1930 * the flush returns error?
1916 if (rc != 0) { 1931 */
1917 cifsInode->write_behind_rc = rc; 1932 rc = filemap_write_and_wait(inode->i_mapping);
1918 rc = 0; 1933 if (rc != 0) {
1919 } 1934 cifsInode->write_behind_rc = rc;
1935 rc = 0;
1920 } 1936 }
1921 1937
1922 if (attrs->ia_valid & ATTR_SIZE) { 1938 if (attrs->ia_valid & ATTR_SIZE) {
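
The rewritten comment blocks above describe a deferred-error pattern: a failure from the pre-setattr flush is stored on the inode rather than failing the attribute change itself, to be reported by a later writeback-aware caller. A user-space sketch of the pattern (illustrative names, not the kernel API):

    struct inode_state {
        int write_behind_rc;    /* deferred writeback error, 0 if none */
    };

    /* Flush before setattr: stash any error for a later fsync/close to
     * report, and let the attribute change itself proceed. */
    static int flush_before_setattr(struct inode_state *st, int flush_rc)
    {
        if (flush_rc != 0) {
            st->write_behind_rc = flush_rc;
            flush_rc = 0;
        }
        return flush_rc;
    }
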
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 63f644000ce5..cd83c53fcbb5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -107,63 +107,51 @@ void *
107cifs_follow_link(struct dentry *direntry, struct nameidata *nd) 107cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
108{ 108{
109 struct inode *inode = direntry->d_inode; 109 struct inode *inode = direntry->d_inode;
110 int rc = -EACCES; 110 int rc = -ENOMEM;
111 int xid; 111 int xid;
112 char *full_path = NULL; 112 char *full_path = NULL;
113 char *target_path = ERR_PTR(-ENOMEM); 113 char *target_path = NULL;
114 struct cifs_sb_info *cifs_sb; 114 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
115 struct cifsTconInfo *pTcon; 115 struct cifsTconInfo *tcon = cifs_sb->tcon;
116 116
117 xid = GetXid(); 117 xid = GetXid();
118 118
119 full_path = build_path_from_dentry(direntry); 119 /*
120 120 * For now, we just handle symlinks with unix extensions enabled.
121 if (!full_path) 121 * Eventually we should handle NTFS reparse points, and MacOS
122 goto out_no_free; 122 * symlink support. For instance...
123 123 *
124 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 124 * rc = CIFSSMBQueryReparseLinkInfo(...)
125 cifs_sb = CIFS_SB(inode->i_sb); 125 *
126 pTcon = cifs_sb->tcon; 126 * For now, just return -EACCES when the server doesn't support posix
127 target_path = kmalloc(PATH_MAX, GFP_KERNEL); 127 * extensions. Note that we still allow querying symlinks when posix
128 if (!target_path) { 128 * extensions are manually disabled. We could disable these as well
129 target_path = ERR_PTR(-ENOMEM); 129 * but there doesn't seem to be any harm in allowing the client to
130 * read them.
131 */
132 if (!(tcon->ses->capabilities & CAP_UNIX)) {
133 rc = -EACCES;
130 goto out; 134 goto out;
131 } 135 }
132 136
133 /* We could change this to: 137 full_path = build_path_from_dentry(direntry);
134 if (pTcon->unix_ext) 138 if (!full_path)
135 but there does not seem any point in refusing to 139 goto out;
136 get symlink info if we can, even if unix extensions
137 turned off for this mount */
138
139 if (pTcon->ses->capabilities & CAP_UNIX)
140 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
141 target_path,
142 PATH_MAX-1,
143 cifs_sb->local_nls);
144 else {
145 /* BB add read reparse point symlink code here */
146 /* rc = CIFSSMBQueryReparseLinkInfo */
147 /* BB Add code to Query ReparsePoint info */
148 /* BB Add MAC style xsymlink check here if enabled */
149 }
150
151 if (rc == 0) {
152 140
153/* BB Add special case check for Samba DFS symlinks */ 141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
154 142
155 target_path[PATH_MAX-1] = 0; 143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
156 } else { 144 cifs_sb->local_nls);
145 kfree(full_path);
146out:
147 if (rc != 0) {
157 kfree(target_path); 148 kfree(target_path);
158 target_path = ERR_PTR(rc); 149 target_path = ERR_PTR(rc);
159 } 150 }
160 151
161out:
162 kfree(full_path);
163out_no_free:
164 FreeXid(xid); 152 FreeXid(xid);
165 nd_set_link(nd, target_path); 153 nd_set_link(nd, target_path);
166 return NULL; /* No cookie */ 154 return NULL;
167} 155}
168 156
169int 157int
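
The rewritten cifs_follow_link() above leans on the kernel's ERR_PTR convention: a single pointer either carries the target path or encodes a negative errno for nd_set_link() to surface. A user-space sketch of the convention, with stand-ins for the real ERR_PTR()/IS_ERR() macros:

    #include <errno.h>
    #include <stdint.h>

    static inline void *err_ptr(long err)
    {
        return (void *)err;
    }

    static inline int is_err(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-4095;    /* top 4095 values */
    }

    /* Either a resolved target or an encoded error, never both. */
    static void *follow_link(int have_unix_ext)
    {
        if (!have_unix_ext)
            return err_ptr(-EACCES);
        return "target/path";
    }
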
@@ -224,98 +212,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
224 return rc; 212 return rc;
225} 213}
226 214
227int
228cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
229{
230 struct inode *inode = direntry->d_inode;
231 int rc = -EACCES;
232 int xid;
233 int oplock = 0;
234 struct cifs_sb_info *cifs_sb;
235 struct cifsTconInfo *pTcon;
236 char *full_path = NULL;
237 char *tmpbuffer;
238 int len;
239 __u16 fid;
240
241 xid = GetXid();
242 cifs_sb = CIFS_SB(inode->i_sb);
243 pTcon = cifs_sb->tcon;
244
245/* BB would it be safe against deadlock to grab this sem
246 even though rename itself grabs the sem and calls lookup? */
247/* mutex_lock(&inode->i_sb->s_vfs_rename_mutex);*/
248 full_path = build_path_from_dentry(direntry);
249/* mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);*/
250
251 if (full_path == NULL) {
252 FreeXid(xid);
253 return -ENOMEM;
254 }
255
256 cFYI(1,
257 ("Full path: %s inode = 0x%p pBuffer = 0x%p buflen = %d",
258 full_path, inode, pBuffer, buflen));
259 if (buflen > PATH_MAX)
260 len = PATH_MAX;
261 else
262 len = buflen;
263 tmpbuffer = kmalloc(len, GFP_KERNEL);
264 if (tmpbuffer == NULL) {
265 kfree(full_path);
266 FreeXid(xid);
267 return -ENOMEM;
268 }
269
270/* BB add read reparse point symlink code and
271 Unix extensions symlink code here BB */
272/* We could disable this based on pTcon->unix_ext flag instead ... but why? */
273 if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
274 rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path,
275 tmpbuffer,
276 len - 1,
277 cifs_sb->local_nls);
278 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
279 cERROR(1, ("SFU style symlinks not implemented yet"));
280 /* add open and read as in fs/cifs/inode.c */
281 } else {
282 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
283 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
284 cifs_sb->local_nls,
285 cifs_sb->mnt_cifs_flags &
286 CIFS_MOUNT_MAP_SPECIAL_CHR);
287 if (!rc) {
288 rc = CIFSSMBQueryReparseLinkInfo(xid, pTcon, full_path,
289 tmpbuffer,
290 len - 1,
291 fid,
292 cifs_sb->local_nls);
293 if (CIFSSMBClose(xid, pTcon, fid)) {
294 cFYI(1, ("Error closing junction point "
295 "(open for ioctl)"));
296 }
297 /* If it is a DFS junction earlier we would have gotten
298 PATH_NOT_COVERED returned from server so we do
299 not need to request the DFS info here */
300 }
301 }
302 /* BB Anything else to do to handle recursive links? */
303 /* BB Should we be using page ops here? */
304
305 /* BB null terminate returned string in pBuffer? BB */
306 if (rc == 0) {
307 rc = vfs_readlink(direntry, pBuffer, len, tmpbuffer);
308 cFYI(1,
309 ("vfs_readlink called from cifs_readlink returned %d",
310 rc));
311 }
312
313 kfree(tmpbuffer);
314 kfree(full_path);
315 FreeXid(xid);
316 return rc;
317}
318
319void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) 215void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
320{ 216{
321 char *p = nd_get_link(nd); 217 char *p = nd_get_link(nd);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4c89c572891a..e079a9190ec4 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -635,77 +635,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
635 return; 635 return;
636} 636}
637 637
638/* Windows maps these to the user defined 16 bit Unicode range since they are
639 reserved symbols (along with \ and /), otherwise illegal to store
640 in filenames in NTFS */
641#define UNI_ASTERIK (__u16) ('*' + 0xF000)
642#define UNI_QUESTION (__u16) ('?' + 0xF000)
643#define UNI_COLON (__u16) (':' + 0xF000)
644#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
645#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
646#define UNI_PIPE (__u16) ('|' + 0xF000)
647#define UNI_SLASH (__u16) ('\\' + 0xF000)
648
649/* Convert 16 bit Unicode pathname from wire format to string in current code
650 page. Conversion may involve remapping up the seven characters that are
651 only legal in POSIX-like OS (if they are present in the string). Path
652 names are little endian 16 bit Unicode on the wire */
653int
654cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
655 const struct nls_table *cp)
656{
657 int i, j, len;
658 __u16 src_char;
659
660 for (i = 0, j = 0; i < maxlen; i++) {
661 src_char = le16_to_cpu(source[i]);
662 switch (src_char) {
663 case 0:
664 goto cUCS_out; /* BB check this BB */
665 case UNI_COLON:
666 target[j] = ':';
667 break;
668 case UNI_ASTERIK:
669 target[j] = '*';
670 break;
671 case UNI_QUESTION:
672 target[j] = '?';
673 break;
674 /* BB We can not handle remapping slash until
675 all the calls to build_path_from_dentry
676 are modified, as they use slash as separator BB */
677 /* case UNI_SLASH:
678 target[j] = '\\';
679 break;*/
680 case UNI_PIPE:
681 target[j] = '|';
682 break;
683 case UNI_GRTRTHAN:
684 target[j] = '>';
685 break;
686 case UNI_LESSTHAN:
687 target[j] = '<';
688 break;
689 default:
690 len = cp->uni2char(src_char, &target[j],
691 NLS_MAX_CHARSET_SIZE);
692 if (len > 0) {
693 j += len;
694 continue;
695 } else {
696 target[j] = '?';
697 }
698 }
699 j++;
700 /* make sure we do not overrun callers allocated temp buffer */
701 if (j >= (2 * NAME_MAX))
702 break;
703 }
704cUCS_out:
705 target[j] = 0;
706 return j;
707}
708
709/* Convert 16 bit Unicode pathname to wire format from string in current code 638/* Convert 16 bit Unicode pathname to wire format from string in current code
710 page. Conversion may involve remapping up the seven characters that are 639 page. Conversion may involve remapping up the seven characters that are
711 only legal in POSIX-like OS (if they are present in the string). Path 640 only legal in POSIX-like OS (if they are present in the string). Path
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8703d68f5b20..e2fe998989a3 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -79,6 +79,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
79 {ErrQuota, -EDQUOT}, 79 {ErrQuota, -EDQUOT},
80 {ErrNotALink, -ENOLINK}, 80 {ErrNotALink, -ENOLINK},
81 {ERRnetlogonNotStarted, -ENOPROTOOPT}, 81 {ERRnetlogonNotStarted, -ENOPROTOOPT},
82 {ERRsymlink, -EOPNOTSUPP},
82 {ErrTooManyLinks, -EMLINK}, 83 {ErrTooManyLinks, -EMLINK},
83 {0, 0} 84 {0, 0}
84}; 85};
@@ -714,6 +715,7 @@ static const struct {
714 ERRDOS, ERRnoaccess, 0xc000028f}, { 715 ERRDOS, ERRnoaccess, 0xc000028f}, {
715 ERRDOS, ERRnoaccess, 0xc0000290}, { 716 ERRDOS, ERRnoaccess, 0xc0000290}, {
716 ERRDOS, ERRbadfunc, 0xc000029c}, { 717 ERRDOS, ERRbadfunc, 0xc000029c}, {
718 ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
717 ERRDOS, ERRinvlevel, 0x007c0001}, }; 719 ERRDOS, ERRinvlevel, 0x007c0001}, };
718 720
719/***************************************************************************** 721/*****************************************************************************
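
The additions above route NT_STATUS_STOPPED_ON_SYMLINK through the DOS-error table to -EOPNOTSUPP. Collapsing that two-step mapping (NT status to DOS error to errno) into one sentinel-terminated table, as an illustrative sketch:

    #include <errno.h>

    struct err_map {
        unsigned int status;
        int posix_err;
    };

    static const struct err_map smb_err_map[] = {
        { 0x8000002d, -EOPNOTSUPP },    /* NT_STATUS_STOPPED_ON_SYMLINK */
        { 0, 0 }                        /* sentinel */
    };

    static int map_smb_error(unsigned int status)
    {
        const struct err_map *m;

        for (m = smb_err_map; m->status != 0; m++)
            if (m->status == status)
                return m->posix_err;
        return -EIO;                    /* unmapped: generic I/O error */
    }
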
diff --git a/fs/cifs/nterr.h b/fs/cifs/nterr.h
index 588abbb9d08c..257267367d41 100644
--- a/fs/cifs/nterr.h
+++ b/fs/cifs/nterr.h
@@ -35,8 +35,6 @@ struct nt_err_code_struct {
35extern const struct nt_err_code_struct nt_errs[]; 35extern const struct nt_err_code_struct nt_errs[];
36 36
37/* Win32 Status codes. */ 37/* Win32 Status codes. */
38
39#define STATUS_BUFFER_OVERFLOW 0x80000005
40#define STATUS_MORE_ENTRIES 0x0105 38#define STATUS_MORE_ENTRIES 0x0105
41#define ERROR_INVALID_PARAMETER 0x0057 39#define ERROR_INVALID_PARAMETER 0x0057
42#define ERROR_INSUFFICIENT_BUFFER 0x007a 40#define ERROR_INSUFFICIENT_BUFFER 0x007a
@@ -50,6 +48,13 @@ extern const struct nt_err_code_struct nt_errs[];
50#define STATUS_SOME_UNMAPPED 0x0107 48#define STATUS_SOME_UNMAPPED 0x0107
51#define STATUS_BUFFER_OVERFLOW 0x80000005 49#define STATUS_BUFFER_OVERFLOW 0x80000005
52#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a 50#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a
51#define NT_STATUS_MEDIA_CHANGED 0x8000001c
52#define NT_STATUS_END_OF_MEDIA 0x8000001e
53#define NT_STATUS_MEDIA_CHECK 0x80000020
 54#define NT_STATUS_NO_DATA_DETECTED 0x80000022
55#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d
56#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288
 57#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000289
53#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001 58#define NT_STATUS_UNSUCCESSFUL 0xC0000000 | 0x0001
54#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002 59#define NT_STATUS_NOT_IMPLEMENTED 0xC0000000 | 0x0002
55#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003 60#define NT_STATUS_INVALID_INFO_CLASS 0xC0000000 | 0x0003
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index c377d8065d99..49c9a4e75319 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -27,29 +27,39 @@
27#define UnknownMessage cpu_to_le32(8) 27#define UnknownMessage cpu_to_le32(8)
28 28
29/* Negotiate Flags */ 29/* Negotiate Flags */
30#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are in unicode */ 30#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */
31#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ 31#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */
32#define NTLMSSP_REQUEST_TARGET 0x04 /* Server return its auth realm */ 32#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */
33#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signature capability */ 33/* define reserved9 0x08 */
34#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ 34#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */
35#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 35#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */
36#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Sign/seal use LM session key */ 36#define NTLMSSP_NEGOTIATE_DGRAM 0x0040
37#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ 37#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */
38#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 38/* defined reserved 8 0x0100 */
39#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */
40#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */
41#define NTLMSSP_ANONYMOUS 0x0800
42#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */
39#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 43#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
40#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server on same machine */ 44#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */
41#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign for all security levels */ 45#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */
42#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 46#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000
43#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 47#define NTLMSSP_TARGET_TYPE_SERVER 0x20000
44#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 48#define NTLMSSP_TARGET_TYPE_SHARE 0x40000
45#define NTLMSSP_NEGOTIATE_NTLMV2 0x80000 49#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/
46#define NTLMSSP_REQUEST_INIT_RESP 0x100000 50/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */
47#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 51#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000
48#define NTLMSSP_REQUEST_NOT_NT_KEY 0x400000 52#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */
53#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000
49#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 54#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
50#define NTLMSSP_NEGOTIATE_128 0x20000000 55/* #define reserved4 0x1000000 */
51#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 56#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */
52#define NTLMSSP_NEGOTIATE_56 0x80000000 57/* #define reserved3 0x4000000 */
58/* #define reserved2 0x8000000 */
59/* #define reserved1 0x10000000 */
60#define NTLMSSP_NEGOTIATE_128 0x20000000
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000
53 63
54/* Although typedefs are not commonly used for structure definitions */ 64/* Although typedefs are not commonly used for structure definitions */
55/* in the Linux kernel, in this particular case they are useful */ 65/* in the Linux kernel, in this particular case they are useful */
@@ -60,32 +70,36 @@
60typedef struct _SECURITY_BUFFER { 70typedef struct _SECURITY_BUFFER {
61 __le16 Length; 71 __le16 Length;
62 __le16 MaximumLength; 72 __le16 MaximumLength;
63 __le32 Buffer; /* offset to buffer */ 73 __le32 BufferOffset; /* offset to buffer */
64} __attribute__((packed)) SECURITY_BUFFER; 74} __attribute__((packed)) SECURITY_BUFFER;
65 75
66typedef struct _NEGOTIATE_MESSAGE { 76typedef struct _NEGOTIATE_MESSAGE {
67 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; 77 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
68 __le32 MessageType; /* 1 */ 78 __le32 MessageType; /* NtLmNegotiate = 1 */
69 __le32 NegotiateFlags; 79 __le32 NegotiateFlags;
70 SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ 80 SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */
71 SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ 81 SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
82 /* SECURITY_BUFFER for version info not present since we
83 do not set the version is present flag */
72 char DomainString[0]; 84 char DomainString[0];
73 /* followed by WorkstationString */ 85 /* followed by WorkstationString */
74} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; 86} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
75 87
76typedef struct _CHALLENGE_MESSAGE { 88typedef struct _CHALLENGE_MESSAGE {
77 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; 89 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
78 __le32 MessageType; /* 2 */ 90 __le32 MessageType; /* NtLmChallenge = 2 */
79 SECURITY_BUFFER TargetName; 91 SECURITY_BUFFER TargetName;
80 __le32 NegotiateFlags; 92 __le32 NegotiateFlags;
81 __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; 93 __u8 Challenge[CIFS_CRYPTO_KEY_SIZE];
82 __u8 Reserved[8]; 94 __u8 Reserved[8];
83 SECURITY_BUFFER TargetInfoArray; 95 SECURITY_BUFFER TargetInfoArray;
96 /* SECURITY_BUFFER for version info not present since we
97 do not set the version is present flag */
84} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE; 98} __attribute__((packed)) CHALLENGE_MESSAGE, *PCHALLENGE_MESSAGE;
85 99
86typedef struct _AUTHENTICATE_MESSAGE { 100typedef struct _AUTHENTICATE_MESSAGE {
87 __u8 Signature[sizeof (NTLMSSP_SIGNATURE)]; 101 __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
 88 __le32 MessageType; /* 3 */ 102 __le32 MessageType; /* NtLmAuthenticate = 3 */
89 SECURITY_BUFFER LmChallengeResponse; 103 SECURITY_BUFFER LmChallengeResponse;
90 SECURITY_BUFFER NtChallengeResponse; 104 SECURITY_BUFFER NtChallengeResponse;
91 SECURITY_BUFFER DomainName; 105 SECURITY_BUFFER DomainName;
@@ -93,5 +107,7 @@ typedef struct _AUTHENTICATE_MESSAGE {
93 SECURITY_BUFFER WorkstationName; 107 SECURITY_BUFFER WorkstationName;
94 SECURITY_BUFFER SessionKey; 108 SECURITY_BUFFER SessionKey;
95 __le32 NegotiateFlags; 109 __le32 NegotiateFlags;
110 /* SECURITY_BUFFER for version info not present since we
111 do not set the version is present flag */
96 char UserString[0]; 112 char UserString[0];
97} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; 113} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
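
With the flag definitions cleaned up above, a client builds its negotiate word by OR-ing the capabilities it wants. A user-space sketch with a plausible selection of flags (not the exact set the CIFS client sends):

    #include <stdint.h>
    #include <stdio.h>

    #define NEG_UNICODE     0x00000001u  /* text strings are unicode */
    #define NEG_NTLM        0x00000200u  /* NTLM authentication */
    #define NEG_ALWAYS_SIGN 0x00008000u  /* sign at all security levels */
    #define NEG_128         0x20000000u  /* 128-bit session security */

    int main(void)
    {
        uint32_t flags = NEG_UNICODE | NEG_NTLM | NEG_ALWAYS_SIGN | NEG_128;

        printf("negotiate flags = 0x%08x\n", flags);  /* 0x20008201 */
        return 0;
    }
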
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c2c01ff4c32c..964e097c8203 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -31,6 +31,13 @@
31#include "cifs_fs_sb.h" 31#include "cifs_fs_sb.h"
32#include "cifsfs.h" 32#include "cifsfs.h"
33 33
34/*
 35 * To be safe, allocate extra room for UCS to UTF-8 conversion: the rare
 36 * characters that expand to long multibyte UTF-8 sequences need more
 37 * space in the target buffer.
38 */
39#define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
40
34#ifdef CONFIG_CIFS_DEBUG2 41#ifdef CONFIG_CIFS_DEBUG2
35static void dump_cifs_file_struct(struct file *file, char *label) 42static void dump_cifs_file_struct(struct file *file, char *label)
36{ 43{
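
The 4x factor in UNICODE_NAME_MAX above bounds the expansion when each 16-bit code unit of a wire name becomes a multibyte UTF-8 sequence. A sketch of the per-unit worst case (BMP characters need up to three bytes; the extra margin also absorbs surrogate pairs and NLS quirks):

    /* UTF-8 bytes needed for one UCS-2 code unit in the BMP. */
    static int utf8_len_of_ucs2(unsigned int cu)
    {
        if (cu < 0x80)
            return 1;       /* ASCII */
        if (cu < 0x800)
            return 2;
        return 3;           /* rest of the basic multilingual plane */
    }
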
@@ -239,6 +246,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
239 if (atomic_read(&cifsInfo->inUse) == 0) 246 if (atomic_read(&cifsInfo->inUse) == 0)
240 atomic_set(&cifsInfo->inUse, 1); 247 atomic_set(&cifsInfo->inUse, 1);
241 248
249 cifsInfo->server_eof = end_of_file;
242 spin_lock(&tmp_inode->i_lock); 250 spin_lock(&tmp_inode->i_lock);
243 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 251 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
244 /* can not safely change the file size here if the 252 /* can not safely change the file size here if the
@@ -375,6 +383,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
375 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); 383 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
376 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); 384 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
377 385
386 cifsInfo->server_eof = end_of_file;
378 spin_lock(&tmp_inode->i_lock); 387 spin_lock(&tmp_inode->i_lock);
379 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 388 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
380 /* can not safely change the file size here if the 389 /* can not safely change the file size here if the
@@ -436,6 +445,38 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
436 } 445 }
437} 446}
438 447
448/* BB eventually need to add the following helper function to
449 resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
450 we try to do FindFirst on (NTFS) directory symlinks */
451/*
452int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
453 int xid)
454{
455 __u16 fid;
456 int len;
457 int oplock = 0;
458 int rc;
459 struct cifsTconInfo *ptcon = cifs_sb->tcon;
460 char *tmpbuffer;
461
462 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
463 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
464 cifs_sb->local_nls,
465 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
466 if (!rc) {
 467 tmpbuffer = kmalloc(maxpath, GFP_KERNEL);
468 rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
469 tmpbuffer,
470 maxpath -1,
471 fid,
472 cifs_sb->local_nls);
473 if (CIFSSMBClose(xid, ptcon, fid)) {
 474 cFYI(1, ("Error closing temporary reparsepoint open"));
475 }
476 }
477}
478 */
479
439static int initiate_cifs_search(const int xid, struct file *file) 480static int initiate_cifs_search(const int xid, struct file *file)
440{ 481{
441 int rc = 0; 482 int rc = 0;
@@ -491,7 +532,10 @@ ffirst_retry:
491 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); 532 CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
492 if (rc == 0) 533 if (rc == 0)
493 cifsFile->invalidHandle = false; 534 cifsFile->invalidHandle = false;
494 if ((rc == -EOPNOTSUPP) && 535 /* BB add following call to handle readdir on new NTFS symlink errors
536 else if STATUS_STOPPED_ON_SYMLINK
537 call get_symlink_reparse_path and retry with new path */
538 else if ((rc == -EOPNOTSUPP) &&
495 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { 539 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
496 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 540 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
497 goto ffirst_retry; 541 goto ffirst_retry;
@@ -820,7 +864,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
820/* inode num, inode type and filename returned */ 864/* inode num, inode type and filename returned */
821static int cifs_get_name_from_search_buf(struct qstr *pqst, 865static int cifs_get_name_from_search_buf(struct qstr *pqst,
822 char *current_entry, __u16 level, unsigned int unicode, 866 char *current_entry, __u16 level, unsigned int unicode,
823 struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) 867 struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
824{ 868{
825 int rc = 0; 869 int rc = 0;
826 unsigned int len = 0; 870 unsigned int len = 0;
@@ -840,7 +884,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
840 len = strnlen(filename, PATH_MAX); 884 len = strnlen(filename, PATH_MAX);
841 } 885 }
842 886
843 *pinum = pFindData->UniqueId; 887 *pinum = le64_to_cpu(pFindData->UniqueId);
844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 888 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
845 FILE_DIRECTORY_INFO *pFindData = 889 FILE_DIRECTORY_INFO *pFindData =
846 (FILE_DIRECTORY_INFO *)current_entry; 890 (FILE_DIRECTORY_INFO *)current_entry;
@@ -856,7 +900,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
856 (SEARCH_ID_FULL_DIR_INFO *)current_entry; 900 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
857 filename = &pFindData->FileName[0]; 901 filename = &pFindData->FileName[0];
858 len = le32_to_cpu(pFindData->FileNameLength); 902 len = le32_to_cpu(pFindData->FileNameLength);
859 *pinum = pFindData->UniqueId; 903 *pinum = le64_to_cpu(pFindData->UniqueId);
860 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { 904 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
861 FILE_BOTH_DIRECTORY_INFO *pFindData = 905 FILE_BOTH_DIRECTORY_INFO *pFindData =
862 (FILE_BOTH_DIRECTORY_INFO *)current_entry; 906 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
@@ -879,14 +923,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
879 } 923 }
880 924
881 if (unicode) { 925 if (unicode) {
882 /* BB fixme - test with long names */ 926 pqst->len = cifs_from_ucs2((char *) pqst->name,
883 /* Note converted filename can be longer than in unicode */ 927 (__le16 *) filename,
884 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) 928 UNICODE_NAME_MAX,
885 pqst->len = cifs_convertUCSpath((char *)pqst->name, 929 min(len, max_len), nlt,
886 (__le16 *)filename, len/2, nlt); 930 cifs_sb->mnt_cifs_flags &
887 else 931 CIFS_MOUNT_MAP_SPECIAL_CHR);
888 pqst->len = cifs_strfromUCS_le((char *)pqst->name,
889 (__le16 *)filename, len/2, nlt);
890 } else { 932 } else {
891 pqst->name = filename; 933 pqst->name = filename;
892 pqst->len = len; 934 pqst->len = len;
@@ -896,8 +938,8 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
896 return rc; 938 return rc;
897} 939}
898 940
899static int cifs_filldir(char *pfindEntry, struct file *file, 941static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
900 filldir_t filldir, void *direntry, char *scratch_buf, int max_len) 942 void *direntry, char *scratch_buf, unsigned int max_len)
901{ 943{
902 int rc = 0; 944 int rc = 0;
903 struct qstr qstring; 945 struct qstr qstring;
@@ -994,7 +1036,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
994 int num_to_fill = 0; 1036 int num_to_fill = 0;
995 char *tmp_buf = NULL; 1037 char *tmp_buf = NULL;
996 char *end_of_smb; 1038 char *end_of_smb;
997 int max_len; 1039 unsigned int max_len;
998 1040
999 xid = GetXid(); 1041 xid = GetXid();
1000 1042
@@ -1068,11 +1110,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
1068 cifsFile->srch_inf.ntwrk_buf_start); 1110 cifsFile->srch_inf.ntwrk_buf_start);
1069 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 1111 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
1070 1112
1071 /* To be safe - for UCS to UTF-8 with strings loaded 1113 tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
1072 with the rare long characters alloc more to account for
1073 such multibyte target UTF-8 characters. cifs_unicode.c,
1074 which actually does the conversion, has the same limit */
1075 tmp_buf = kmalloc((2 * NAME_MAX) + 4, GFP_KERNEL);
1076 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 1114 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
1077 if (current_entry == NULL) { 1115 if (current_entry == NULL) {
1078 /* evaluate whether this case is an error */ 1116 /* evaluate whether this case is an error */
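The readdir.c hunks above replace the old ad-hoc scratch sizing, (2 * NAME_MAX) + 4, with UNICODE_NAME_MAX, so cifs_readdir() and the conversion helpers in cifs_unicode.c share one worst-case bound, and cifs_from_ucs2() folds the MAP_SPECIAL_CHR decision into a single call. A sketch of the sizing argument behind the shared constant (its real definition lives in fs/cifs/cifs_unicode.h; treat the exact figure below as recalled, not quoted):

    /*
     * Worst-case UTF-16 -> UTF-8 growth: a BMP character is one UTF-16
     * code unit and at most 3 UTF-8 bytes; a supplementary character is
     * two code units (a surrogate pair) and exactly 4 UTF-8 bytes.  So
     * 4 bytes per source code unit, plus a 2-byte terminator, is a safe
     * over-estimate for a NAME_MAX-unit filename.
     */
    #define UNICODE_NAME_MAX ((4 * NAME_MAX) + 2)
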
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5c68b4282be9..897a052270f9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * SMB/CIFS session setup handling routines 4 * SMB/CIFS session setup handling routines
5 * 5 *
6 * Copyright (c) International Business Machines Corp., 2006, 2007 6 * Copyright (c) International Business Machines Corp., 2006, 2009
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
@@ -111,7 +111,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
111get_vc_num_exit: 111get_vc_num_exit:
112 write_unlock(&cifs_tcp_ses_lock); 112 write_unlock(&cifs_tcp_ses_lock);
113 113
114 return le16_to_cpu(vcnum); 114 return cpu_to_le16(vcnum);
115} 115}
116 116
117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) 117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
@@ -277,85 +277,51 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
277 *pbcc_area = bcc_ptr; 277 *pbcc_area = bcc_ptr;
278} 278}
279 279
280static int decode_unicode_ssetup(char **pbcc_area, int bleft, 280static void
281 struct cifsSesInfo *ses, 281decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
282 const struct nls_table *nls_cp) 282 const struct nls_table *nls_cp)
283{ 283{
284 int rc = 0; 284 int len;
285 int words_left, len;
286 char *data = *pbcc_area; 285 char *data = *pbcc_area;
287 286
288
289
290 cFYI(1, ("bleft %d", bleft)); 287 cFYI(1, ("bleft %d", bleft));
291 288
292 289 /*
293 /* SMB header is unaligned, so cifs servers word align start of 290 * Windows servers do not always double null terminate their final
294 Unicode strings */ 291 * Unicode string. Check to see if there are an uneven number of bytes
295 data++; 292 * left. If so, then add an extra NULL pad byte to the end of the
296 bleft--; /* Windows servers do not always double null terminate 293 * response.
297 their final Unicode string - in which case we 294 *
298 now will not attempt to decode the byte of junk 295 * See section 2.7.2 in "Implementing CIFS" for details
299 which follows it */ 296 */
300 297 if (bleft % 2) {
301 words_left = bleft / 2; 298 data[bleft] = 0;
302 299 ++bleft;
303 /* save off server operating system */ 300 }
304 len = UniStrnlen((wchar_t *) data, words_left);
305
306/* We look for obvious messed up bcc or strings in response so we do not go off
307 the end since (at least) WIN2K and Windows XP have a major bug in not null
308 terminating last Unicode string in response */
309 if (len >= words_left)
310 return rc;
311 301
312 kfree(ses->serverOS); 302 kfree(ses->serverOS);
313 /* UTF-8 string will not grow more than four times as big as UCS-16 */ 303 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
314 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); 304 cFYI(1, ("serverOS=%s", ses->serverOS));
315 if (ses->serverOS != NULL) 305 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
316 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); 306 data += len;
317 data += 2 * (len + 1); 307 bleft -= len;
318 words_left -= len + 1; 308 if (bleft <= 0)
319 309 return;
320 /* save off server network operating system */
321 len = UniStrnlen((wchar_t *) data, words_left);
322
323 if (len >= words_left)
324 return rc;
325 310
326 kfree(ses->serverNOS); 311 kfree(ses->serverNOS);
327 ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); 312 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
328 if (ses->serverNOS != NULL) { 313 cFYI(1, ("serverNOS=%s", ses->serverNOS));
329 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, 314 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
330 nls_cp); 315 data += len;
331 if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) { 316 bleft -= len;
332 cFYI(1, ("NT4 server")); 317 if (bleft <= 0)
333 ses->flags |= CIFS_SES_NT4; 318 return;
334 }
335 }
336 data += 2 * (len + 1);
337 words_left -= len + 1;
338
339 /* save off server domain */
340 len = UniStrnlen((wchar_t *) data, words_left);
341
342 if (len > words_left)
343 return rc;
344 319
345 kfree(ses->serverDomain); 320 kfree(ses->serverDomain);
346 ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ 321 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
347 if (ses->serverDomain != NULL) { 322 cFYI(1, ("serverDomain=%s", ses->serverDomain));
348 cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
349 nls_cp);
350 ses->serverDomain[2*len] = 0;
351 ses->serverDomain[(2*len) + 1] = 0;
352 }
353 data += 2 * (len + 1);
354 words_left -= len + 1;
355 323
356 cFYI(1, ("words left: %d", words_left)); 324 return;
357
358 return rc;
359} 325}
360 326
361static int decode_ascii_ssetup(char **pbcc_area, int bleft, 327static int decode_ascii_ssetup(char **pbcc_area, int bleft,
@@ -412,6 +378,186 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
412 return rc; 378 return rc;
413} 379}
414 380
381static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
382 struct cifsSesInfo *ses)
383{
384 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
385
386 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
387 cERROR(1, ("challenge blob len %d too small", blob_len));
388 return -EINVAL;
389 }
390
391 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
392 cERROR(1, ("blob signature incorrect %s", pblob->Signature));
393 return -EINVAL;
394 }
395 if (pblob->MessageType != NtLmChallenge) {
396 cERROR(1, ("Incorrect message type %d", pblob->MessageType));
397 return -EINVAL;
398 }
399
400 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
401 /* BB we could decode pblob->NegotiateFlags; some may be useful */
402 /* In particular we can examine sign flags */
403 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
404 we must set the MIC field of the AUTHENTICATE_MESSAGE */
405
406 return 0;
407}
408
409#ifdef CONFIG_CIFS_EXPERIMENTAL
410/* BB Move to ntlmssp.c eventually */
411
412/* We do not malloc the blob, it is passed in pbuffer, because
413 it is fixed size, and small, making this approach cleaner */
414static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
415 struct cifsSesInfo *ses)
416{
417 NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
418 __u32 flags;
419
420 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
421 sec_blob->MessageType = NtLmNegotiate;
422
423 /* BB is NTLMV2 session security format easier to use here? */
424 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
425 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
426 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
427 if (ses->server->secMode &
428 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
429 flags |= NTLMSSP_NEGOTIATE_SIGN;
430 if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
431 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
432
433 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
434
435 sec_blob->WorkstationName.BufferOffset = 0;
436 sec_blob->WorkstationName.Length = 0;
437 sec_blob->WorkstationName.MaximumLength = 0;
438
439 /* Domain name is sent on the Challenge not Negotiate NTLMSSP request */
440 sec_blob->DomainName.BufferOffset = 0;
441 sec_blob->DomainName.Length = 0;
442 sec_blob->DomainName.MaximumLength = 0;
443}
444
445/* We do not malloc the blob, it is passed in pbuffer, because its
446 maximum possible size is fixed and small, making this approach cleaner.
447 This function returns the length of the data in the blob */
448static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
449 struct cifsSesInfo *ses,
450 const struct nls_table *nls_cp, int first)
451{
452 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
453 __u32 flags;
454 unsigned char *tmp;
455 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
456
457 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
458 sec_blob->MessageType = NtLmAuthenticate;
459
460 flags = NTLMSSP_NEGOTIATE_56 |
461 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
462 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
463 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
464 if (ses->server->secMode &
465 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
466 flags |= NTLMSSP_NEGOTIATE_SIGN;
467 if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
468 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
469
470 tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE);
471 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
472
473 sec_blob->LmChallengeResponse.BufferOffset =
474 cpu_to_le32(sizeof(AUTHENTICATE_MESSAGE));
475 sec_blob->LmChallengeResponse.Length = 0;
476 sec_blob->LmChallengeResponse.MaximumLength = 0;
477
478 /* calculate session key, BB what about adding similar ntlmv2 path? */
479 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
480 if (first)
481 cifs_calculate_mac_key(&ses->server->mac_signing_key,
482 ntlm_session_key, ses->password);
483
484 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
485 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
486 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.MaximumLength =
488 cpu_to_le16(CIFS_SESS_KEY_SIZE);
489
490 tmp += CIFS_SESS_KEY_SIZE;
491
492 if (ses->domainName == NULL) {
493 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
494 sec_blob->DomainName.Length = 0;
495 sec_blob->DomainName.MaximumLength = 0;
496 tmp += 2;
497 } else {
498 int len;
499 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
500 MAX_USERNAME_SIZE, nls_cp);
501 len *= 2; /* unicode is 2 bytes each */
502 len += 2; /* trailing null */
503 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
504 sec_blob->DomainName.Length = cpu_to_le16(len);
505 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
506 tmp += len;
507 }
508
509 if (ses->userName == NULL) {
510 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
511 sec_blob->UserName.Length = 0;
512 sec_blob->UserName.MaximumLength = 0;
513 tmp += 2;
514 } else {
515 int len;
516 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
517 MAX_USERNAME_SIZE, nls_cp);
518 len *= 2; /* unicode is 2 bytes each */
519 len += 2; /* trailing null */
520 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
521 sec_blob->UserName.Length = cpu_to_le16(len);
522 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
523 tmp += len;
524 }
525
526 sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer);
527 sec_blob->WorkstationName.Length = 0;
528 sec_blob->WorkstationName.MaximumLength = 0;
529 tmp += 2;
530
531 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
532 sec_blob->SessionKey.Length = 0;
533 sec_blob->SessionKey.MaximumLength = 0;
534 return tmp - pbuffer;
535}
536
537
538static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
539 struct cifsSesInfo *ses)
540{
541 build_ntlmssp_negotiate_blob(&pSMB->req.SecurityBlob[0], ses);
542 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
543
544 return;
545}
546
547static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
548 struct cifsSesInfo *ses,
549 const struct nls_table *nls, int first_time)
550{
551 int bloblen;
552
553 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
554 first_time);
555 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
556
557 return bloblen;
558}
559#endif
560
415int 561int
416CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 562CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
417 const struct nls_table *nls_cp) 563 const struct nls_table *nls_cp)
@@ -430,6 +576,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
430 __u16 action; 576 __u16 action;
431 int bytes_remaining; 577 int bytes_remaining;
432 struct key *spnego_key = NULL; 578 struct key *spnego_key = NULL;
579 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
433 580
434 if (ses == NULL) 581 if (ses == NULL)
435 return -EINVAL; 582 return -EINVAL;
@@ -437,6 +584,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
437 type = ses->server->secType; 584 type = ses->server->secType;
438 585
439 cFYI(1, ("sess setup type %d", type)); 586 cFYI(1, ("sess setup type %d", type));
587ssetup_ntlmssp_authenticate:
588 if (phase == NtLmChallenge)
589 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
590
440 if (type == LANMAN) { 591 if (type == LANMAN) {
441#ifndef CONFIG_CIFS_WEAK_PW_HASH 592#ifndef CONFIG_CIFS_WEAK_PW_HASH
442 /* LANMAN and plaintext are less secure and off by default. 593 /* LANMAN and plaintext are less secure and off by default.
@@ -650,9 +801,53 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
650 goto ssetup_exit; 801 goto ssetup_exit;
651#endif /* CONFIG_CIFS_UPCALL */ 802#endif /* CONFIG_CIFS_UPCALL */
652 } else { 803 } else {
804#ifdef CONFIG_CIFS_EXPERIMENTAL
805 if ((experimEnabled > 1) && (type == RawNTLMSSP)) {
806 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
807 cERROR(1, ("NTLMSSP requires Unicode support"));
808 rc = -ENOSYS;
809 goto ssetup_exit;
810 }
811
812 cFYI(1, ("ntlmssp session setup phase %d", phase));
813 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
814 capabilities |= CAP_EXTENDED_SECURITY;
815 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
816 if (phase == NtLmNegotiate) {
817 setup_ntlmssp_neg_req(pSMB, ses);
818 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
819 } else if (phase == NtLmAuthenticate) {
820 int blob_len;
821 blob_len = setup_ntlmssp_auth_req(pSMB, ses,
822 nls_cp,
823 first_time);
824 iov[1].iov_len = blob_len;
825 /* Make sure that we tell the server that we
826 are using the uid that it just gave us back
827 on the response (challenge) */
828 smb_buf->Uid = ses->Suid;
829 } else {
830 cERROR(1, ("invalid phase %d", phase));
831 rc = -ENOSYS;
832 goto ssetup_exit;
833 }
834 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
835 /* unicode strings must be word aligned */
836 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
837 *bcc_ptr = 0;
838 bcc_ptr++;
839 }
840 unicode_oslm_strings(&bcc_ptr, nls_cp);
841 } else {
842 cERROR(1, ("secType %d not supported!", type));
843 rc = -ENOSYS;
844 goto ssetup_exit;
845 }
846#else
653 cERROR(1, ("secType %d not supported!", type)); 847 cERROR(1, ("secType %d not supported!", type));
654 rc = -ENOSYS; 848 rc = -ENOSYS;
655 goto ssetup_exit; 849 goto ssetup_exit;
850#endif
656 } 851 }
657 852
658 iov[2].iov_base = str_area; 853 iov[2].iov_base = str_area;
@@ -668,12 +863,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
668 /* SMB request buf freed in SendReceive2 */ 863 /* SMB request buf freed in SendReceive2 */
669 864
670 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc)); 865 cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
671 if (rc)
672 goto ssetup_exit;
673 866
674 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 867 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
675 smb_buf = (struct smb_hdr *)iov[0].iov_base; 868 smb_buf = (struct smb_hdr *)iov[0].iov_base;
676 869
870 if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
871 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
872 if (phase != NtLmNegotiate) {
873 cERROR(1, ("Unexpected more processing error"));
874 goto ssetup_exit;
875 }
876 /* NTLMSSP Negotiate sent now processing challenge (response) */
877 phase = NtLmChallenge; /* process ntlmssp challenge */
878 rc = 0; /* MORE_PROC rc is not an error here, but expected */
879 }
880 if (rc)
881 goto ssetup_exit;
882
677 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 883 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
678 rc = -EIO; 884 rc = -EIO;
679 cERROR(1, ("bad word count %d", smb_buf->WordCount)); 885 cERROR(1, ("bad word count %d", smb_buf->WordCount));
@@ -692,22 +898,33 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
692 if (smb_buf->WordCount == 4) { 898 if (smb_buf->WordCount == 4) {
693 __u16 blob_len; 899 __u16 blob_len;
694 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 900 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
695 bcc_ptr += blob_len;
696 if (blob_len > bytes_remaining) { 901 if (blob_len > bytes_remaining) {
697 cERROR(1, ("bad security blob length %d", blob_len)); 902 cERROR(1, ("bad security blob length %d", blob_len));
698 rc = -EINVAL; 903 rc = -EINVAL;
699 goto ssetup_exit; 904 goto ssetup_exit;
700 } 905 }
906 if (phase == NtLmChallenge) {
907 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
908 /* now goto beginning for ntlmssp authenticate phase */
909 if (rc)
910 goto ssetup_exit;
911 }
912 bcc_ptr += blob_len;
701 bytes_remaining -= blob_len; 913 bytes_remaining -= blob_len;
702 } 914 }
703 915
704 /* BB check if Unicode and decode strings */ 916 /* BB check if Unicode and decode strings */
705 if (smb_buf->Flags2 & SMBFLG2_UNICODE) 917 if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
706 rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, 918 /* unicode string area must be word-aligned */
707 ses, nls_cp); 919 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
708 else 920 ++bcc_ptr;
921 --bytes_remaining;
922 }
923 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
924 } else {
709 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, 925 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
710 ses, nls_cp); 926 ses, nls_cp);
927 }
711 928
712ssetup_exit: 929ssetup_exit:
713 if (spnego_key) { 930 if (spnego_key) {
@@ -721,5 +938,9 @@ ssetup_exit:
721 } else if (resp_buf_type == CIFS_LARGE_BUFFER) 938 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
722 cifs_buf_release(iov[0].iov_base); 939 cifs_buf_release(iov[0].iov_base);
723 940
941 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
942 if ((phase == NtLmChallenge) && (rc == 0))
943 goto ssetup_ntlmssp_authenticate;
944
724 return rc; 945 return rc;
725} 946}
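The sess.c changes turn CIFS_SessSetup() into a small state machine for RawNTLMSSP under CONFIG_CIFS_EXPERIMENTAL: send a Negotiate blob, treat the server's NT_STATUS_MORE_PROCESSING_REQUIRED reply as carrying the Challenge rather than as an error, then jump back (via the ssetup_ntlmssp_authenticate label) to send the Authenticate blob. A condensed sketch of that control flow, rewritten as an explicit loop with a placeholder send_setup_request() standing in for the build/send/parse steps:

    __le32 phase = NtLmNegotiate;

    for (;;) {
            status = send_setup_request(ses, phase); /* Negotiate or Auth blob */

            if (phase == NtLmNegotiate &&
                status == NT_STATUS_MORE_PROCESSING_REQUIRED) {
                    /* reply carries the server challenge, not an error */
                    decode_ntlmssp_challenge(blob, blob_len, ses);
                    phase = NtLmAuthenticate;
                    continue;            /* second SESSION_SETUP round trip */
            }
            break;                       /* success, or a real error */
    }

The real code also pads the packet so the Unicode string area stays word-aligned on both the send and receive sides, since the security blob length can be odd.
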
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index 7f50e8577c1c..c5084d27db7c 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -110,6 +110,7 @@
110 110
111/* Below errors are used internally (do not come over the wire) for passthrough 111/* Below errors are used internally (do not come over the wire) for passthrough
112 from STATUS codes to POSIX only */ 112 from STATUS codes to POSIX only */
113#define ERRsymlink 0xFFFD
113#define ErrTooManyLinks 0xFFFE 114#define ErrTooManyLinks 0xFFFE
114 115
115/* Following error codes may be generated with the ERRSRV error class.*/ 116/* Following error codes may be generated with the ERRSRV error class.*/
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..681ed81e6be0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
51#include <linux/poll.h> 51#include <linux/poll.h>
52#include <linux/mm.h> 52#include <linux/mm.h>
53#include <linux/eventpoll.h> 53#include <linux/eventpoll.h>
54#include <linux/fs_struct.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
@@ -180,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
180 struct compat_stat __user *statbuf) 181 struct compat_stat __user *statbuf)
181{ 182{
182 struct kstat stat; 183 struct kstat stat;
183 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 184 int error;
184 185
185 if (!error) 186 error = vfs_stat(filename, &stat);
186 error = cp_compat_stat(&stat, statbuf); 187 if (error)
187 return error; 188 return error;
189 return cp_compat_stat(&stat, statbuf);
188} 190}
189 191
190asmlinkage long compat_sys_newlstat(char __user * filename, 192asmlinkage long compat_sys_newlstat(char __user * filename,
191 struct compat_stat __user *statbuf) 193 struct compat_stat __user *statbuf)
192{ 194{
193 struct kstat stat; 195 struct kstat stat;
194 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 196 int error;
195 197
196 if (!error) 198 error = vfs_lstat(filename, &stat);
197 error = cp_compat_stat(&stat, statbuf); 199 if (error)
198 return error; 200 return error;
201 return cp_compat_stat(&stat, statbuf);
199} 202}
200 203
201#ifndef __ARCH_WANT_STAT64 204#ifndef __ARCH_WANT_STAT64
@@ -203,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
203 struct compat_stat __user *statbuf, int flag) 206 struct compat_stat __user *statbuf, int flag)
204{ 207{
205 struct kstat stat; 208 struct kstat stat;
206 int error = -EINVAL; 209 int error;
207
208 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
209 goto out;
210
211 if (flag & AT_SYMLINK_NOFOLLOW)
212 error = vfs_lstat_fd(dfd, filename, &stat);
213 else
214 error = vfs_stat_fd(dfd, filename, &stat);
215
216 if (!error)
217 error = cp_compat_stat(&stat, statbuf);
218 210
219out: 211 error = vfs_fstatat(dfd, filename, &stat, flag);
220 return error; 212 if (error)
213 return error;
214 return cp_compat_stat(&stat, statbuf);
221} 215}
222#endif 216#endif
223 217
@@ -1195,16 +1189,12 @@ out:
1195 return ret; 1189 return ret;
1196} 1190}
1197 1191
1198asmlinkage ssize_t 1192static size_t compat_readv(struct file *file,
1199compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1193 const struct compat_iovec __user *vec,
1194 unsigned long vlen, loff_t *pos)
1200{ 1195{
1201 struct file *file;
1202 ssize_t ret = -EBADF; 1196 ssize_t ret = -EBADF;
1203 1197
1204 file = fget(fd);
1205 if (!file)
1206 return -EBADF;
1207
1208 if (!(file->f_mode & FMODE_READ)) 1198 if (!(file->f_mode & FMODE_READ))
1209 goto out; 1199 goto out;
1210 1200
@@ -1212,25 +1202,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1212 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1202 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1213 goto out; 1203 goto out;
1214 1204
1215 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1205 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1216 1206
1217out: 1207out:
1218 if (ret > 0) 1208 if (ret > 0)
1219 add_rchar(current, ret); 1209 add_rchar(current, ret);
1220 inc_syscr(current); 1210 inc_syscr(current);
1221 fput(file);
1222 return ret; 1211 return ret;
1223} 1212}
1224 1213
1225asmlinkage ssize_t 1214asmlinkage ssize_t
1226compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1215compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1216 unsigned long vlen)
1227{ 1217{
1228 struct file *file; 1218 struct file *file;
1229 ssize_t ret = -EBADF; 1219 int fput_needed;
1220 ssize_t ret;
1230 1221
1231 file = fget(fd); 1222 file = fget_light(fd, &fput_needed);
1223 if (!file)
1224 return -EBADF;
1225 ret = compat_readv(file, vec, vlen, &file->f_pos);
1226 fput_light(file, fput_needed);
1227 return ret;
1228}
1229
1230asmlinkage ssize_t
1231compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1232 unsigned long vlen, u32 pos_low, u32 pos_high)
1233{
1234 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1235 struct file *file;
1236 int fput_needed;
1237 ssize_t ret;
1238
1239 if (pos < 0)
1240 return -EINVAL;
1241 file = fget_light(fd, &fput_needed);
1232 if (!file) 1242 if (!file)
1233 return -EBADF; 1243 return -EBADF;
1244 ret = compat_readv(file, vec, vlen, &pos);
1245 fput_light(file, fput_needed);
1246 return ret;
1247}
1248
1249static size_t compat_writev(struct file *file,
1250 const struct compat_iovec __user *vec,
1251 unsigned long vlen, loff_t *pos)
1252{
1253 ssize_t ret = -EBADF;
1254
1234 if (!(file->f_mode & FMODE_WRITE)) 1255 if (!(file->f_mode & FMODE_WRITE))
1235 goto out; 1256 goto out;
1236 1257
@@ -1238,13 +1259,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1238 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1259 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1239 goto out; 1260 goto out;
1240 1261
1241 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1262 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1242 1263
1243out: 1264out:
1244 if (ret > 0) 1265 if (ret > 0)
1245 add_wchar(current, ret); 1266 add_wchar(current, ret);
1246 inc_syscw(current); 1267 inc_syscw(current);
1247 fput(file); 1268 return ret;
1269}
1270
1271asmlinkage ssize_t
1272compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1273 unsigned long vlen)
1274{
1275 struct file *file;
1276 int fput_needed;
1277 ssize_t ret;
1278
1279 file = fget_light(fd, &fput_needed);
1280 if (!file)
1281 return -EBADF;
1282 ret = compat_writev(file, vec, vlen, &file->f_pos);
1283 fput_light(file, fput_needed);
1284 return ret;
1285}
1286
1287asmlinkage ssize_t
1288compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1289 unsigned long vlen, u32 pos_low, u32 pos_high)
1290{
1291 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1292 struct file *file;
1293 int fput_needed;
1294 ssize_t ret;
1295
1296 if (pos < 0)
1297 return -EINVAL;
1298 file = fget_light(fd, &fput_needed);
1299 if (!file)
1300 return -EBADF;
1301 ret = compat_writev(file, vec, vlen, &pos);
1302 fput_light(file, fput_needed);
1248 return ret; 1303 return ret;
1249} 1304}
1250 1305
@@ -1421,6 +1476,7 @@ int compat_do_execve(char * filename,
1421 struct linux_binprm *bprm; 1476 struct linux_binprm *bprm;
1422 struct file *file; 1477 struct file *file;
1423 struct files_struct *displaced; 1478 struct files_struct *displaced;
1479 bool clear_in_exec;
1424 int retval; 1480 int retval;
1425 1481
1426 retval = unshare_files(&displaced); 1482 retval = unshare_files(&displaced);
@@ -1441,12 +1497,16 @@ int compat_do_execve(char * filename,
1441 bprm->cred = prepare_exec_creds(); 1497 bprm->cred = prepare_exec_creds();
1442 if (!bprm->cred) 1498 if (!bprm->cred)
1443 goto out_unlock; 1499 goto out_unlock;
1444 check_unsafe_exec(bprm); 1500
1501 retval = check_unsafe_exec(bprm);
1502 if (retval < 0)
1503 goto out_unlock;
1504 clear_in_exec = retval;
1445 1505
1446 file = open_exec(filename); 1506 file = open_exec(filename);
1447 retval = PTR_ERR(file); 1507 retval = PTR_ERR(file);
1448 if (IS_ERR(file)) 1508 if (IS_ERR(file))
1449 goto out_unlock; 1509 goto out_unmark;
1450 1510
1451 sched_exec(); 1511 sched_exec();
1452 1512
@@ -1488,6 +1548,7 @@ int compat_do_execve(char * filename,
1488 goto out; 1548 goto out;
1489 1549
1490 /* execve succeeded */ 1550 /* execve succeeded */
1551 current->fs->in_exec = 0;
1491 current->in_execve = 0; 1552 current->in_execve = 0;
1492 mutex_unlock(&current->cred_exec_mutex); 1553 mutex_unlock(&current->cred_exec_mutex);
1493 acct_update_integrals(current); 1554 acct_update_integrals(current);
@@ -1506,6 +1567,10 @@ out_file:
1506 fput(bprm->file); 1567 fput(bprm->file);
1507 } 1568 }
1508 1569
1570out_unmark:
1571 if (clear_in_exec)
1572 current->fs->in_exec = 0;
1573
1509out_unlock: 1574out_unlock:
1510 current->in_execve = 0; 1575 current->in_execve = 0;
1511 mutex_unlock(&current->cred_exec_mutex); 1576 mutex_unlock(&current->cred_exec_mutex);
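compat_sys_preadv()/compat_sys_pwritev() rebuild the 64-bit offset from two 32-bit arguments, pos = ((loff_t)pos_high << 32) | pos_low, because a 32-bit ABI cannot pass a loff_t in a single register; the fget_light()/fput_light() pair likewise avoids the full fget() reference bump on the fast path. From userspace the register split is invisible. A minimal, runnable example of the vectored positional read these entry points service (the file name is arbitrary):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
            char hdr[4], body[12];
            struct iovec iov[2] = {
                    { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
                    { .iov_base = body, .iov_len = sizeof(body) },
            };
            int fd = open("/etc/hostname", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* scatter-read 16 bytes from offset 0; f_pos is not moved */
            ssize_t n = preadv(fd, iov, 2, 0);
            printf("preadv returned %zd\n", n);
            close(fd);
            return 0;
    }
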
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93b..b83f6bcfa51a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
@@ -58,7 +58,6 @@
58#include <linux/i2c.h> 58#include <linux/i2c.h>
59#include <linux/i2c-dev.h> 59#include <linux/i2c-dev.h>
60#include <linux/atalk.h> 60#include <linux/atalk.h>
61#include <linux/loop.h>
62 61
63#include <net/bluetooth/bluetooth.h> 62#include <net/bluetooth/bluetooth.h>
64#include <net/bluetooth/hci.h> 63#include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
68#include <linux/gigaset_dev.h> 67#include <linux/gigaset_dev.h>
69 68
70#ifdef CONFIG_BLOCK 69#ifdef CONFIG_BLOCK
70#include <linux/loop.h>
71#include <scsi/scsi.h> 71#include <scsi/scsi.h>
72#include <scsi/scsi_ioctl.h> 72#include <scsi/scsi_ioctl.h>
73#include <scsi/sg.h> 73#include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
2660HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) 2660HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
2661/* block stuff */ 2661/* block stuff */
2662#ifdef CONFIG_BLOCK 2662#ifdef CONFIG_BLOCK
2663/* loop */
2664IGNORE_IOCTL(LOOP_CLR_FD)
2663/* Raw devices */ 2665/* Raw devices */
2664HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) 2666HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
2665HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) 2667HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
2728IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 2730IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
2729IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 2731IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
2730 2732
2731/* loop */
2732IGNORE_IOCTL(LOOP_CLR_FD)
2733
2734#ifdef CONFIG_SPARC 2733#ifdef CONFIG_SPARC
2735/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ 2734/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
2736IGNORE_IOCTL(FBIOGTYPE) 2735IGNORE_IOCTL(FBIOGTYPE)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 932a92b31483..c8afa6b1d91d 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
135 struct path path; 135 struct path path;
136 struct configfs_dirent *sd; 136 struct configfs_dirent *sd;
137 struct config_item *parent_item; 137 struct config_item *parent_item;
138 struct config_item *target_item; 138 struct config_item *target_item = NULL;
139 struct config_item_type *type; 139 struct config_item_type *type;
140 140
141 ret = -EPERM; /* What lack-of-symlink returns */ 141 ret = -EPERM; /* What lack-of-symlink returns */
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
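Taken together, the two cramfs hunks fix silent corruption on decompression failure: cramfs_uncompress_block() now returns -EIO instead of pretending it produced zero bytes, and cramfs_readpage() propagates that by marking the page errored instead of handing userspace a page of zeros. The resulting caller pattern, condensed from the hunks above:

    bytes_filled = cramfs_uncompress_block(pgdata, PAGE_CACHE_SIZE,
                                           cramfs_read(sb, start_offset,
                                                       compr_len),
                                           compr_len);
    if (unlikely(bytes_filled < 0)) {       /* zlib failure, not a hole */
            ClearPageUptodate(page);
            SetPageError(page);             /* readers now see -EIO */
    } else {
            /* holes and short blocks stay legal: zero the tail */
            memset(pgdata + bytes_filled, 0,
                   PAGE_CACHE_SIZE - bytes_filled);
            SetPageUptodate(page);
    }

Note the kmap()/kunmap() pair also moved so the mapping is held across both paths, and the bad-blocksize case now takes the error path instead of falling through with bytes_filled == 0.
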
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..75659a6fd1f8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -481,7 +481,7 @@ restart:
481 if ((flags & DCACHE_REFERENCED) 481 if ((flags & DCACHE_REFERENCED)
482 && (dentry->d_flags & DCACHE_REFERENCED)) { 482 && (dentry->d_flags & DCACHE_REFERENCED)) {
483 dentry->d_flags &= ~DCACHE_REFERENCED; 483 dentry->d_flags &= ~DCACHE_REFERENCED;
484 list_move_tail(&dentry->d_lru, &referenced); 484 list_move(&dentry->d_lru, &referenced);
485 spin_unlock(&dentry->d_lock); 485 spin_unlock(&dentry->d_lock);
486 } else { 486 } else {
487 list_move_tail(&dentry->d_lru, &tmp); 487 list_move_tail(&dentry->d_lru, &tmp);
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2149 int result; 2149 int result;
2150 unsigned long seq; 2150 unsigned long seq;
2151 2151
2152 /* FIXME: This is old behavior, needed? Please check callers. */
2153 if (new_dentry == old_dentry) 2152 if (new_dentry == old_dentry)
2154 return 1; 2153 return 1;
2155 2154
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
30 30
31static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 32static int debugfs_mount_count;
33static bool debugfs_registered;
33 34
34static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
35{ 36{
@@ -496,6 +497,16 @@ exit:
496} 497}
497EXPORT_SYMBOL_GPL(debugfs_rename); 498EXPORT_SYMBOL_GPL(debugfs_rename);
498 499
500/**
501 * debugfs_initialized - Tells whether debugfs has been registered
502 */
503bool debugfs_initialized(void)
504{
505 return debugfs_registered;
506}
507EXPORT_SYMBOL_GPL(debugfs_initialized);
508
509
499static struct kobject *debug_kobj; 510static struct kobject *debug_kobj;
500 511
501static int __init debugfs_init(void) 512static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
509 retval = register_filesystem(&debug_fs_type); 520 retval = register_filesystem(&debug_fs_type);
510 if (retval) 521 if (retval)
511 kobject_put(debug_kobj); 522 kobject_put(debug_kobj);
523 else
524 debugfs_registered = true;
525
512 return retval; 526 return retval;
513} 527}
514 528
515static void __exit debugfs_exit(void) 529static void __exit debugfs_exit(void)
516{ 530{
531 debugfs_registered = false;
532
517 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 533 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
518 unregister_filesystem(&debug_fs_type); 534 unregister_filesystem(&debug_fs_type);
519 kobject_put(debug_kobj); 535 kobject_put(debug_kobj);
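debugfs_initialized() exists for callers that can run before debugfs_init() or after debugfs_exit() and want to skip entry creation rather than fail inside it. A hedged usage sketch (driver name and callsite are hypothetical):

    static struct dentry *mydrv_dir;     /* hypothetical driver state */

    static void mydrv_create_debug_entries(void)
    {
            if (!debugfs_initialized())  /* e.g. called from early init */
                    return;
            mydrv_dir = debugfs_create_dir("mydrv", NULL);
    }
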
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 63a4a59e4148..c68edb969441 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -90,6 +90,15 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
90#define PARSE_MOUNT 0 90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1 91#define PARSE_REMOUNT 1
92 92
93/*
94 * parse_mount_options():
95 * Set @opts to mount options specified in @data. If an option is not
96 * specified in @data, set it to its default value. The exception is
97 * 'newinstance' option which can only be set/cleared on a mount (i.e.
98 * cannot be changed during remount).
99 *
100 * Note: @data may be NULL (in which case all options are set to default).
101 */
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) 102static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
94{ 103{
95 char *p; 104 char *p;
@@ -355,12 +364,9 @@ static int devpts_get_sb(struct file_system_type *fs_type,
355 struct pts_mount_opts opts; 364 struct pts_mount_opts opts;
356 struct super_block *s; 365 struct super_block *s;
357 366
358 memset(&opts, 0, sizeof(opts)); 367 error = parse_mount_options(data, PARSE_MOUNT, &opts);
359 if (data) { 368 if (error)
360 error = parse_mount_options(data, PARSE_MOUNT, &opts); 369 return error;
361 if (error)
362 return error;
363 }
364 370
365 if (opts.newinstance) 371 if (opts.newinstance)
366 s = sget(fs_type, NULL, set_anon_super, NULL); 372 s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -389,11 +395,10 @@ static int devpts_get_sb(struct file_system_type *fs_type,
389 return 0; 395 return 0;
390 396
391out_dput: 397out_dput:
392 dput(s->s_root); 398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
393 399
394out_undo_sget: 400out_undo_sget:
395 up_write(&s->s_umount); 401 deactivate_locked_super(s);
396 deactivate_super(s);
397 return error; 402 return error;
398} 403}
399 404
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..05763bbc2050 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
307 struct bio *bio; 307 struct bio *bio;
308 308
309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 309 bio = bio_alloc(GFP_KERNEL, nr_vecs);
310 if (bio == NULL)
311 return -ENOMEM;
312 310
313 bio->bi_bdev = bdev; 311 bio->bi_bdev = bdev;
314 bio->bi_sector = first_sector; 312 bio->bi_sector = first_sector;
@@ -1126,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1124 int acquire_i_mutex = 0;
1127 1125
1128 if (rw & WRITE) 1126 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1127 rw = WRITE_ODIRECT;
1130 1128
1131 if (bdev) 1129 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1130 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
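Two independent direct-io changes here: the NULL check after bio_alloc() goes away, and O_DIRECT writes are tagged WRITE_ODIRECT instead of WRITE_SYNC. The first rests on an allocator guarantee worth spelling out (this is the rationale as I read the cleanup; the patch itself states none):

    /*
     * bio_alloc() with __GFP_WAIT set (GFP_KERNEL includes it) draws
     * from a mempool and sleeps until memory is available, so it does
     * not return NULL here -- the deleted check could never fire.
     */
    bio = bio_alloc(GFP_KERNEL, nr_vecs);
    bio->bi_bdev = bdev;
    bio->bi_sector = first_sector;
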
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612cf..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8b65f289ee00..b91851f1cda3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page)
483 ecryptfs_inode = page->mapping->host; 483 ecryptfs_inode = page->mapping->host;
484 crypt_stat = 484 crypt_stat =
485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
486 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 486 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
487 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
488 0, PAGE_CACHE_SIZE);
489 if (rc)
490 printk(KERN_ERR "%s: Error attempting to copy "
491 "page at index [%ld]\n", __func__,
492 page->index);
493 goto out;
494 }
495 enc_extent_page = alloc_page(GFP_USER); 487 enc_extent_page = alloc_page(GFP_USER);
496 if (!enc_extent_page) { 488 if (!enc_extent_page) {
497 rc = -ENOMEM; 489 rc = -ENOMEM;
@@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page)
620 ecryptfs_inode = page->mapping->host; 612 ecryptfs_inode = page->mapping->host;
621 crypt_stat = 613 crypt_stat =
622 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 614 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
623 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 615 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
624 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
625 PAGE_CACHE_SIZE,
626 ecryptfs_inode);
627 if (rc)
628 printk(KERN_ERR "%s: Error attempting to copy "
629 "page at index [%ld]\n", __func__,
630 page->index);
631 goto out;
632 }
633 enc_extent_page = alloc_page(GFP_USER); 616 enc_extent_page = alloc_page(GFP_USER);
634 if (!enc_extent_page) { 617 if (!enc_extent_page) {
635 rc = -ENOMEM; 618 rc = -ENOMEM;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 064c5820e4e5..00b30a2d5466 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
272#define ECRYPTFS_UNLINK_SIGS 0x00004000
272 u32 flags; 273 u32 flags;
273 unsigned int file_version; 274 unsigned int file_version;
274 size_t iv_bytes; 275 size_t iv_bytes;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55b3145b8072..2f0945d63297 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
379 goto out_d_drop; 379 goto out_d_drop;
380 } 380 }
381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
382 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
382 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 383 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
383 lower_dir_dentry, 384 lower_dir_dentry,
384 ecryptfs_dentry->d_name.len); 385 ecryptfs_dentry->d_name.len);
386 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
385 if (IS_ERR(lower_dentry)) { 387 if (IS_ERR(lower_dentry)) {
386 rc = PTR_ERR(lower_dentry); 388 rc = PTR_ERR(lower_dentry);
387 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 389 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
406 "filename; rc = [%d]\n", __func__, rc); 408 "filename; rc = [%d]\n", __func__, rc);
407 goto out_d_drop; 409 goto out_d_drop;
408 } 410 }
411 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
409 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 412 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
410 lower_dir_dentry, 413 lower_dir_dentry,
411 encrypted_and_encoded_name_size - 1); 414 encrypted_and_encoded_name_size - 1);
415 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
412 if (IS_ERR(lower_dentry)) { 416 if (IS_ERR(lower_dentry)) {
413 rc = PTR_ERR(lower_dentry); 417 rc = PTR_ERR(lower_dentry);
414 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 418 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -636,8 +640,9 @@ static int
636ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 640ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
637{ 641{
638 char *lower_buf; 642 char *lower_buf;
643 size_t lower_bufsiz;
639 struct dentry *lower_dentry; 644 struct dentry *lower_dentry;
640 struct ecryptfs_crypt_stat *crypt_stat; 645 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
641 char *plaintext_name; 646 char *plaintext_name;
642 size_t plaintext_name_size; 647 size_t plaintext_name_size;
643 mm_segment_t old_fs; 648 mm_segment_t old_fs;
@@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
648 rc = -EINVAL; 653 rc = -EINVAL;
649 goto out; 654 goto out;
650 } 655 }
651 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 656 mount_crypt_stat = &ecryptfs_superblock_to_private(
657 dentry->d_sb)->mount_crypt_stat;
658 /*
659 * If the lower filename is encrypted, it will result in a significantly
660 * longer name. If needed, truncate the name after decode and decrypt.
661 */
662 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
663 lower_bufsiz = PATH_MAX;
664 else
665 lower_bufsiz = bufsiz;
652 /* Released in this function */ 666 /* Released in this function */
653 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
654 if (lower_buf == NULL) { 668 if (lower_buf == NULL) {
655 printk(KERN_ERR "%s: Out of memory whilst attempting to " 669 printk(KERN_ERR "%s: Out of memory whilst attempting to "
656 "kmalloc [%d] bytes\n", __func__, bufsiz); 670 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
657 rc = -ENOMEM; 671 rc = -ENOMEM;
658 goto out; 672 goto out;
659 } 673 }
@@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
661 set_fs(get_ds()); 675 set_fs(get_ds());
662 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 676 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
663 (char __user *)lower_buf, 677 (char __user *)lower_buf,
664 bufsiz); 678 lower_bufsiz);
665 set_fs(old_fs); 679 set_fs(old_fs);
666 if (rc >= 0) { 680 if (rc >= 0) {
667 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 681 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
@@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
674 rc); 688 rc);
675 goto out_free_lower_buf; 689 goto out_free_lower_buf;
676 } 690 }
677 rc = copy_to_user(buf, plaintext_name, plaintext_name_size); 691 /* Check for bufsiz <= 0 done in sys_readlinkat() */
692 rc = copy_to_user(buf, plaintext_name,
693 min((size_t) bufsiz, plaintext_name_size));
678 if (rc) 694 if (rc)
679 rc = -EFAULT; 695 rc = -EFAULT;
680 else 696 else
@@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
814 size_t num_zeros = (PAGE_CACHE_SIZE 830 size_t num_zeros = (PAGE_CACHE_SIZE
815 - (new_length & ~PAGE_CACHE_MASK)); 831 - (new_length & ~PAGE_CACHE_MASK));
816 832
833 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
834 rc = vmtruncate(inode, new_length);
835 if (rc)
836 goto out_free;
837 rc = vmtruncate(lower_dentry->d_inode, new_length);
838 goto out_free;
839 }
817 if (num_zeros) { 840 if (num_zeros) {
818 char *zeros_virt; 841 char *zeros_virt;
819 842
@@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
915 } 938 }
916 rc = 0; 939 rc = 0;
917 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 940 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
918 mutex_unlock(&crypt_stat->cs_mutex);
919 goto out;
920 } 941 }
921 } 942 }
922 mutex_unlock(&crypt_stat->cs_mutex); 943 mutex_unlock(&crypt_stat->cs_mutex);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c3145..af737bb56cb7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
740out_release_free_unlock: 740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm); 741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock: 742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); 743 kzfree(s->block_aligned_filename);
744 kfree(s->block_aligned_filename);
745out_unlock: 744out_unlock:
746 mutex_unlock(s->tfm_mutex); 745 mutex_unlock(s->tfm_mutex);
747out: 746out:
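kzfree() replaces the open-coded scrub-then-free for buffers that held key material (the same substitution appears in messaging.c below). One property worth knowing: kzfree() zeroes ksize(p) bytes, i.e. the whole slab object, which is at least as large as the original request:

    /* before: zero the sensitive buffer by hand, then free it */
    memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
    kfree(s->block_aligned_filename);

    /* after: one call, and the scrub cannot be forgotten or miscounted */
    kzfree(s->block_aligned_filename);
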
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index aed56c25539b..9f0aa9883c28 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
190 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 191 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops; 192 dentry->d_op = &ecryptfs_dops;
193 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
194 d_add(dentry, inode);
195 else
196 d_instantiate(dentry, inode);
197 fsstack_copy_attr_all(inode, lower_inode, NULL); 193 fsstack_copy_attr_all(inode, lower_inode, NULL);
198 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
199 * other metadata */ 195 * other metadata */
200 fsstack_copy_inode_size(inode, lower_inode); 196 fsstack_copy_inode_size(inode, lower_inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode);
199 else
200 d_instantiate(dentry, inode);
201out: 201out:
202 return rc; 202 return rc;
203} 203}
@@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err }; 211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
212 212
213static const match_table_t tokens = { 213static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -222,6 +222,7 @@ static const match_table_t tokens = {
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, 222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
225 {ecryptfs_opt_err, NULL} 226 {ecryptfs_opt_err, NULL}
226}; 227};
227 228
@@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
402 fn_cipher_key_bytes; 403 fn_cipher_key_bytes;
403 fn_cipher_key_bytes_set = 1; 404 fn_cipher_key_bytes_set = 1;
404 break; 405 break;
406 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break;
405 case ecryptfs_opt_err: 409 case ecryptfs_opt_err:
406 default: 410 default:
407 printk(KERN_WARNING 411 printk(KERN_WARNING
@@ -610,9 +614,8 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
610 } 614 }
611 goto out; 615 goto out;
612out_abort: 616out_abort:
613 dput(sb->s_root); 617 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
614 up_write(&sb->s_umount); 618 deactivate_locked_super(sb);
615 deactivate_super(sb);
616out: 619out:
617 return rc; 620 return rc;
618} 621}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e01..f1c17e87c5fb 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -133,45 +133,6 @@ out:
133 return rc; 133 return rc;
134} 134}
135 135
136static int
137ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
138 struct ecryptfs_msg_ctx **msg_ctx);
139
140/**
141 * ecryptfs_send_raw_message
142 * @msg_type: Message type
143 * @daemon: Daemon struct for recipient of message
144 *
145 * A raw message is one that does not include an ecryptfs_message
146 * struct. It simply has a type.
147 *
148 * Must be called with ecryptfs_daemon_hash_mux held.
149 *
150 * Returns zero on success; non-zero otherwise
151 */
152static int ecryptfs_send_raw_message(u8 msg_type,
153 struct ecryptfs_daemon *daemon)
154{
155 struct ecryptfs_msg_ctx *msg_ctx;
156 int rc;
157
158 rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
159 if (rc) {
160 printk(KERN_ERR "%s: Error whilst attempting to send "
161 "message to ecryptfsd; rc = [%d]\n", __func__, rc);
162 goto out;
163 }
164 /* Raw messages are logically context-free (e.g., no
165 * reply is expected), so we set the state of the
166 * ecryptfs_msg_ctx object to indicate that it should
167 * be freed as soon as the message is sent. */
168 mutex_lock(&msg_ctx->mux);
169 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
170 mutex_unlock(&msg_ctx->mux);
171out:
172 return rc;
173}
174
175/** 136/**
176 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct 137 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
177 * @daemon: Pointer to set to newly allocated daemon struct 138 * @daemon: Pointer to set to newly allocated daemon struct
@@ -212,49 +173,6 @@ out:
212} 173}
213 174
214/** 175/**
215 * ecryptfs_process_helo
216 * @euid: The user ID owner of the message
217 * @user_ns: The namespace in which @euid applies
218 * @pid: The process ID for the userspace program that sent the
219 * message
220 *
221 * Adds the euid and pid values to the daemon euid hash. If an euid
222 * already has a daemon pid registered, the daemon will be
223 * unregistered before the new daemon is put into the hash list.
224 * Returns zero after adding a new daemon to the hash list;
225 * non-zero otherwise.
226 */
227int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
228 struct pid *pid)
229{
230 struct ecryptfs_daemon *new_daemon;
231 struct ecryptfs_daemon *old_daemon;
232 int rc;
233
234 mutex_lock(&ecryptfs_daemon_hash_mux);
235 rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
236 if (rc != 0) {
237 printk(KERN_WARNING "Received request from user [%d] "
238 "to register daemon [0x%p]; unregistering daemon "
239 "[0x%p]\n", euid, pid, old_daemon->pid);
240 rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
241 if (rc)
242 printk(KERN_WARNING "Failed to send QUIT "
243 "message to daemon [0x%p]; rc = [%d]\n",
244 old_daemon->pid, rc);
245 hlist_del(&old_daemon->euid_chain);
246 kfree(old_daemon);
247 }
248 rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
249 if (rc)
250 printk(KERN_ERR "%s: The gods are displeased with this attempt "
251 "to create a new daemon object for euid [%d]; pid "
252 "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
253 mutex_unlock(&ecryptfs_daemon_hash_mux);
254 return rc;
255}
256
257/**
258 * ecryptfs_exorcise_daemon - Destroy the daemon struct 176 * ecryptfs_exorcise_daemon - Destroy the daemon struct
259 * 177 *
260 * Must be called ceremoniously while in possession of 178 * Must be called ceremoniously while in possession of
@@ -291,8 +209,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
291 if (daemon->user_ns) 209 if (daemon->user_ns)
292 put_user_ns(daemon->user_ns); 210 put_user_ns(daemon->user_ns);
293 mutex_unlock(&daemon->mux); 211 mutex_unlock(&daemon->mux);
294 memset(daemon, 0, sizeof(*daemon)); 212 kzfree(daemon);
295 kfree(daemon);
296out: 213out:
297 return rc; 214 return rc;
298} 215}
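
kzfree(), also introduced in 2.6.30, looks up the allocation size via ksize() and zeroes the buffer before freeing, which is exactly what the removed memset() + kfree() pair did by hand; it is only valid on kmalloc()'d pointers. An illustrative sketch:

    #include <linux/slab.h>

    struct demo_secret {
            u8 key[32];             /* key material that must not linger */
    };

    static void demo_drop_secret(struct demo_secret *s)
    {
            /* Equivalent to: memset(s, 0, ksize(s)); kfree(s); */
            kzfree(s);
    }
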
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea655f49..4ec8f61ccf5a 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
193 int rc = 0; 193 int rc = 0;
194 194
195 mutex_lock(&msg_ctx->mux); 195 mutex_lock(&msg_ctx->mux);
196 if (data) { 196 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
197 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), 197 GFP_KERNEL);
198 GFP_KERNEL); 198 if (!msg_ctx->msg) {
199 if (!msg_ctx->msg) { 199 rc = -ENOMEM;
200 rc = -ENOMEM; 200 printk(KERN_ERR "%s: Out of memory whilst attempting "
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__, 202 (sizeof(*msg_ctx->msg) + data_size));
203 (sizeof(*msg_ctx->msg) + data_size)); 203 goto out_unlock;
204 goto out_unlock; 204 }
205 }
206 } else
207 msg_ctx->msg = NULL;
208 msg_ctx->msg->index = msg_ctx->index; 205 msg_ctx->msg->index = msg_ctx->index;
209 msg_ctx->msg->data_len = data_size; 206 msg_ctx->msg->data_len = data_size;
210 msg_ctx->type = msg_type; 207 msg_ctx->type = msg_type;
211 if (data) { 208 memcpy(msg_ctx->msg->data, data, data_size);
212 memcpy(msg_ctx->msg->data, data, data_size); 209 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
213 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
214 } else
215 msg_ctx->msg_size = 0;
216 mutex_lock(&daemon->mux); 210 mutex_lock(&daemon->mux);
217 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); 211 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
218 daemon->num_queued_msg_ctx++; 212 daemon->num_queued_msg_ctx++;
@@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
418 412
419 if (count == 0) 413 if (count == 0)
420 goto out; 414 goto out;
421 data = kmalloc(count, GFP_KERNEL); 415
422 if (!data) { 416 data = memdup_user(buf, count);
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 417 if (IS_ERR(data)) {
424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); 418 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
419 __func__, PTR_ERR(data));
425 goto out; 420 goto out;
426 } 421 }
427 rc = copy_from_user(data, buf, count);
428 if (rc) {
429 printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
430 __func__, rc);
431 goto out_free;
432 }
433 sz = count; 422 sz = count;
434 i = 0; 423 i = 0;
435 switch (data[i++]) { 424 switch (data[i++]) {
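
memdup_user() folds the kmalloc() + copy_from_user() + unwind dance into one call that returns either the kernel copy or an ERR_PTR() (-ENOMEM or -EFAULT), which is why the write handler above loses its out_free label. A minimal usage sketch:

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/uaccess.h>

    static ssize_t demo_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
    {
            char *data = memdup_user(buf, count);

            if (IS_ERR(data))
                    return PTR_ERR(data);   /* -ENOMEM or -EFAULT */

            /* ... parse the kernel-side copy in data[0..count) ... */

            kfree(data);
            return count;
    }
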
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 46cec2b69796..5c6bab9786e3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
449 struct ecryptfs_crypt_stat *crypt_stat; 449 struct ecryptfs_crypt_stat *crypt_stat;
450 450
451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
452 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
452 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 453 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
453 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); 454 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
454 else 455 else
@@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file,
490 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 491 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
491 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 492 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
492 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 493 "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
494 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
495 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
496 to);
497 if (!rc) {
498 rc = copied;
499 fsstack_copy_inode_size(ecryptfs_inode,
500 ecryptfs_inode_to_lower(ecryptfs_inode));
501 }
502 goto out;
503 }
493 /* Fills in zeros if 'to' goes beyond inode size */ 504 /* Fills in zeros if 'to' goes beyond inode size */
494 rc = fill_zeros_to_end_of_page(page, to); 505 rc = fill_zeros_to_end_of_page(page, to);
495 if (rc) { 506 if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 75c2ea9fee35..a137c6ea2fee 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
117 size_t size) 117 size_t size)
118{ 118{
119 struct page *ecryptfs_page; 119 struct page *ecryptfs_page;
120 struct ecryptfs_crypt_stat *crypt_stat;
121 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
120 char *ecryptfs_page_virt; 122 char *ecryptfs_page_virt;
121 loff_t ecryptfs_file_size = 123 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
122 i_size_read(ecryptfs_file->f_dentry->d_inode);
123 loff_t data_offset = 0; 124 loff_t data_offset = 0;
124 loff_t pos; 125 loff_t pos;
125 int rc = 0; 126 int rc = 0;
126 127
128 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
127 /* 129 /*
128 * if we are writing beyond current size, then start pos 130 * if we are writing beyond current size, then start pos
129 * at the current size - we'll fill in zeros from there. 131 * at the current size - we'll fill in zeros from there.
@@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
184 flush_dcache_page(ecryptfs_page); 186 flush_dcache_page(ecryptfs_page);
185 SetPageUptodate(ecryptfs_page); 187 SetPageUptodate(ecryptfs_page);
186 unlock_page(ecryptfs_page); 188 unlock_page(ecryptfs_page);
187 rc = ecryptfs_encrypt_page(ecryptfs_page); 189 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
190 rc = ecryptfs_encrypt_page(ecryptfs_page);
191 else
192 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
193 ecryptfs_page,
194 start_offset_in_page,
195 data_offset);
188 page_cache_release(ecryptfs_page); 196 page_cache_release(ecryptfs_page);
189 if (rc) { 197 if (rc) {
190 printk(KERN_ERR "%s: Error encrypting " 198 printk(KERN_ERR "%s: Error encrypting "
@@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
194 pos += num_bytes; 202 pos += num_bytes;
195 } 203 }
196 if ((offset + size) > ecryptfs_file_size) { 204 if ((offset + size) > ecryptfs_file_size) {
197 i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); 205 i_size_write(ecryptfs_inode, (offset + size));
198 rc = ecryptfs_write_inode_size_to_metadata( 206 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
199 ecryptfs_file->f_dentry->d_inode); 207 rc = ecryptfs_write_inode_size_to_metadata(
200 if (rc) { 208 ecryptfs_inode);
201 printk(KERN_ERR "Problem with " 209 if (rc) {
202 "ecryptfs_write_inode_size_to_metadata; " 210 printk(KERN_ERR "Problem with "
203 "rc = [%d]\n", rc); 211 "ecryptfs_write_inode_size_to_metadata; "
204 goto out; 212 "rc = [%d]\n", rc);
213 goto out;
214 }
205 } 215 }
206 } 216 }
207out: 217out:
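
Both hunks in this file hinge on the same split: encrypted inodes go through ecryptfs_encrypt_page() and persist their logical size in the header or xattr metadata, while unencrypted inodes write page segments straight to the lower file and have no metadata to update. The per-page decision, restated with the rationale as comments:

    if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
            /* Cipher the page and keep the size in the eCryptfs
             * header/xattr, which only encrypted files carry. */
            rc = ecryptfs_encrypt_page(ecryptfs_page);
    } else {
            /* Plaintext passthrough: the lower inode's i_size is
             * authoritative, so there is nothing extra to persist. */
            rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
                                                   ecryptfs_page,
                                                   start_offset_in_page,
                                                   data_offset);
    }
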
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index c27ac2b358a1..fa4c7e7d15d9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
170 list_for_each_entry(walker, 170 list_for_each_entry(walker,
171 &mount_crypt_stat->global_auth_tok_list, 171 &mount_crypt_stat->global_auth_tok_list,
172 mount_crypt_stat_list) { 172 mount_crypt_stat_list) {
173 seq_printf(m, ",ecryptfs_sig=%s", walker->sig); 173 if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
174 seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
175 else
176 seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
174 } 177 }
175 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 178 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
176 179
@@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
186 seq_printf(m, ",ecryptfs_xattr_metadata"); 189 seq_printf(m, ",ecryptfs_xattr_metadata");
187 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 190 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
188 seq_printf(m, ",ecryptfs_encrypted_view"); 191 seq_printf(m, ",ecryptfs_encrypted_view");
192 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
193 seq_printf(m, ",ecryptfs_unlink_sigs");
189 194
190 return 0; 195 return 0;
191} 196}
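
The show_options change follows the usual super_operations pattern: emit one ",option" fragment per active feature so that /proc/mounts round-trips to a string the option parser accepts. A stripped-down sketch (demo_state() and the flag names are placeholders, not eCryptfs API):

    static int demo_show_options(struct seq_file *m, struct vfsmount *mnt)
    {
            struct demo_mount_state *s = demo_state(mnt->mnt_sb);

            /* One fragment per flag, leading comma included */
            if (s->flags & DEMO_UNLINK_SIGS)
                    seq_printf(m, ",ecryptfs_unlink_sigs");
            if (s->flags & DEMO_XATTR_METADATA)
                    seq_printf(m, ",ecryptfs_xattr_metadata");
            return 0;
    }
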
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
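
The f_fsid lines follow the convention most block-backed filesystems adopted around this cycle: pack the backing device's dev_t with huge_encode_dev() and split the 64-bit result across the two 32-bit fsid words. The idiom in isolation:

    #include <linux/kdev_t.h>
    #include <linux/statfs.h>

    static void demo_fill_fsid(struct super_block *sb, struct kstatfs *buf)
    {
            /* dev_t widened to 64 bits, then split low/high */
            u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

            buf->f_fsid.val[0] = (u32)id;
            buf->f_fsid.val[1] = (u32)(id >> 32);
    }
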
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa2..2a701d593d35 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
28 * issue a wakeup. 28 * issue a wakeup.
29 */ 29 */
30 __u64 count; 30 __u64 count;
31 unsigned int flags;
31}; 32};
32 33
33/* 34/*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
50 n = (int) (ULLONG_MAX - ctx->count); 51 n = (int) (ULLONG_MAX - ctx->count);
51 ctx->count += n; 52 ctx->count += n;
52 if (waitqueue_active(&ctx->wqh)) 53 if (waitqueue_active(&ctx->wqh))
53 wake_up_locked(&ctx->wqh); 54 wake_up_locked_poll(&ctx->wqh, POLLIN);
54 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 55 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
55 56
56 return n; 57 return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
87{ 88{
88 struct eventfd_ctx *ctx = file->private_data; 89 struct eventfd_ctx *ctx = file->private_data;
89 ssize_t res; 90 ssize_t res;
90 __u64 ucnt; 91 __u64 ucnt = 0;
91 DECLARE_WAITQUEUE(wait, current); 92 DECLARE_WAITQUEUE(wait, current);
92 93
93 if (count < sizeof(ucnt)) 94 if (count < sizeof(ucnt))
94 return -EINVAL; 95 return -EINVAL;
95 spin_lock_irq(&ctx->wqh.lock); 96 spin_lock_irq(&ctx->wqh.lock);
96 res = -EAGAIN; 97 res = -EAGAIN;
97 ucnt = ctx->count; 98 if (ctx->count > 0)
98 if (ucnt > 0)
99 res = sizeof(ucnt); 99 res = sizeof(ucnt);
100 else if (!(file->f_flags & O_NONBLOCK)) { 100 else if (!(file->f_flags & O_NONBLOCK)) {
101 __add_wait_queue(&ctx->wqh, &wait); 101 __add_wait_queue(&ctx->wqh, &wait);
102 for (res = 0;;) { 102 for (res = 0;;) {
103 set_current_state(TASK_INTERRUPTIBLE); 103 set_current_state(TASK_INTERRUPTIBLE);
104 if (ctx->count > 0) { 104 if (ctx->count > 0) {
105 ucnt = ctx->count;
106 res = sizeof(ucnt); 105 res = sizeof(ucnt);
107 break; 106 break;
108 } 107 }
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
117 __remove_wait_queue(&ctx->wqh, &wait); 116 __remove_wait_queue(&ctx->wqh, &wait);
118 __set_current_state(TASK_RUNNING); 117 __set_current_state(TASK_RUNNING);
119 } 118 }
120 if (res > 0) { 119 if (likely(res > 0)) {
121 ctx->count = 0; 120 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
121 ctx->count -= ucnt;
122 if (waitqueue_active(&ctx->wqh)) 122 if (waitqueue_active(&ctx->wqh))
123 wake_up_locked(&ctx->wqh); 123 wake_up_locked_poll(&ctx->wqh, POLLOUT);
124 } 124 }
125 spin_unlock_irq(&ctx->wqh.lock); 125 spin_unlock_irq(&ctx->wqh.lock);
126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) 126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
166 __remove_wait_queue(&ctx->wqh, &wait); 166 __remove_wait_queue(&ctx->wqh, &wait);
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 } 168 }
169 if (res > 0) { 169 if (likely(res > 0)) {
170 ctx->count += ucnt; 170 ctx->count += ucnt;
171 if (waitqueue_active(&ctx->wqh)) 171 if (waitqueue_active(&ctx->wqh))
172 wake_up_locked(&ctx->wqh); 172 wake_up_locked_poll(&ctx->wqh, POLLIN);
173 } 173 }
174 spin_unlock_irq(&ctx->wqh.lock); 174 spin_unlock_irq(&ctx->wqh.lock);
175 175
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); 207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
209 209
210 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) 210 if (flags & ~EFD_FLAGS_SET)
211 return -EINVAL; 211 return -EINVAL;
212 212
213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
216 216
217 init_waitqueue_head(&ctx->wqh); 217 init_waitqueue_head(&ctx->wqh);
218 ctx->count = count; 218 ctx->count = count;
219 ctx->flags = flags;
219 220
220 /* 221 /*
221 * When we call this, the initialization must be complete, since 222 * When we call this, the initialization must be complete, since
222 * anon_inode_getfd() will install the fd. 223 * anon_inode_getfd() will install the fd.
223 */ 224 */
224 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 225 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
225 flags & (O_CLOEXEC | O_NONBLOCK)); 226 flags & EFD_SHARED_FCNTL_FLAGS);
226 if (fd < 0) 227 if (fd < 0)
227 kfree(ctx); 228 kfree(ctx);
228 return fd; 229 return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
232{ 233{
233 return sys_eventfd2(count, 0); 234 return sys_eventfd2(count, 0);
234} 235}
236
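
The new ctx->flags field is what makes EFD_SEMAPHORE possible: on read, a semaphore-mode eventfd hands out one "token" (the value read is always 1 and the counter drops by one) instead of draining the whole counter. A userspace sketch, assuming a kernel and libc that expose EFD_SEMAPHORE (2.6.30+):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = eventfd(3, EFD_SEMAPHORE);     /* counter starts at 3 */
            uint64_t v;

            /* Three reads each return 1; a fourth would block, since the
             * counter has reached zero. */
            for (int i = 0; i < 3; i++) {
                    read(fd, &v, sizeof(v));
                    printf("read %d -> %llu\n", i, (unsigned long long)v);
            }
            close(fd);
            return 0;
    }
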
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..5458e80fc558 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
71 * a better scalability. 71 * a better scalability.
72 */ 72 */
73 73
74#define DEBUG_EPOLL 0
75
76#if DEBUG_EPOLL > 0
77#define DPRINTK(x) printk x
78#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
79#else /* #if DEBUG_EPOLL > 0 */
80#define DPRINTK(x) (void) 0
81#define DNPRINTK(n, x) (void) 0
82#endif /* #if DEBUG_EPOLL > 0 */
83
84#define DEBUG_EPI 0
85
86#if DEBUG_EPI != 0
87#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
88#else /* #if DEBUG_EPI != 0 */
89#define EPI_SLAB_DEBUG 0
90#endif /* #if DEBUG_EPI != 0 */
91
92/* Epoll private bits inside the event mask */ 74/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 75#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 76
95/* Maximum number of poll wake up nests we are allowing */ 77/* Maximum nesting depth allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 78#define EP_MAX_NESTS 4
97 79
98/* Maximum msec timeout value storeable in a long int */ 80/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
110}; 92};
111 93
112/* 94/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 95 * Structure used to track possible nested calls, to detect too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 96 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 97 */
119struct wake_task_node { 98struct nested_call_node {
120 struct list_head llink; 99 struct list_head llink;
121 struct task_struct *task; 100 void *cookie;
122 wait_queue_head_t *wq; 101 int cpu;
123}; 102};
124 103
125/* 104/*
126 * This is used to implement the safe poll wake up avoiding to reenter 105 * This structure is used as a collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 106 * maximum recursion depth and loop cycles.
128 */ 107 */
129struct poll_safewake { 108struct nested_calls {
130 struct list_head wake_task_list; 109 struct list_head tasks_call_list;
131 spinlock_t lock; 110 spinlock_t lock;
132}; 111};
133 112
@@ -213,7 +192,7 @@ struct eppoll_entry {
213 struct list_head llink; 192 struct list_head llink;
214 193
215 /* The "base" pointer is set to the container "struct epitem" */ 194 /* The "base" pointer is set to the container "struct epitem" */
216 void *base; 195 struct epitem *base;
217 196
218 /* 197 /*
219 * Wait queue item that will be linked to the target file wait 198 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
231 struct epitem *epi; 210 struct epitem *epi;
232}; 211};
233 212
213/* Used by the ep_send_events() function as callback private data */
214struct ep_send_events_data {
215 int maxevents;
216 struct epoll_event __user *events;
217};
218
234/* 219/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 220 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 221 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
242 */ 227 */
243static DEFINE_MUTEX(epmutex); 228static DEFINE_MUTEX(epmutex);
244 229
245/* Safe wake up implementation */ 230/* Used for safe wake up implementation */
246static struct poll_safewake psw; 231static struct nested_calls poll_safewake_ncalls;
232
233/* Used to call file's f_op->poll() under the nested calls boundaries */
234static struct nested_calls poll_readywalk_ncalls;
247 235
248/* Slab cache used to allocate "struct epitem" */ 236/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 237static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
312} 300}
313 301
314/* Initialize the poll safe wake up structure */ 302/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 303static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 304{
317 305 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 306 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 307}
321 308
322/* 309/**
323 * Perform a safe wake up of the poll wait list. The problem is that 310 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 311 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 312 * the same nested call (identified by its cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 313 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 314 *
328 * and we cannot reenter the same wait queue head at all. This will 315 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 316 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 317 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 318 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 319 * @cookie: Cookie to be used to identify this nested call.
320 *
321 * Returns: Returns the code returned by the @nproc callback, or -1 if
322 * the maximum recursion limit has been exceeded.
333 */ 323 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 324static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
325 int (*nproc)(void *, void *, int), void *priv,
326 void *cookie)
335{ 327{
336 int wake_nests = 0; 328 int error, call_nests = 0;
337 unsigned long flags; 329 unsigned long flags;
338 struct task_struct *this_task = current; 330 int this_cpu = get_cpu();
339 struct list_head *lsthead = &psw->wake_task_list; 331 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 332 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 333 struct nested_call_node tnode;
342 334
343 spin_lock_irqsave(&psw->lock, flags); 335 spin_lock_irqsave(&ncalls->lock, flags);
344 336
345 /* Try to see if the current task is already inside this wakeup call */ 337 /*
338 * Try to see if the current task is already inside this wakeup call.
339 * We use a list here, since the population inside this set is always
340 * very much limited.
341 */
346 list_for_each_entry(tncur, lsthead, llink) { 342 list_for_each_entry(tncur, lsthead, llink) {
347 343 if (tncur->cpu == this_cpu &&
348 if (tncur->wq == wq || 344 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 345 /*
351 * Oops ... loop detected or maximum nest level reached. 346 * Oops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 347 * We abort this wake by breaking the cycle itself.
353 */ 348 */
354 spin_unlock_irqrestore(&psw->lock, flags); 349 error = -1;
355 return; 350 goto out_unlock;
356 } 351 }
357 } 352 }
358 353
359 /* Add the current task to the list */ 354 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 355 tnode.cpu = this_cpu;
361 tnode.wq = wq; 356 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 357 list_add(&tnode.llink, lsthead);
363 358
364 spin_unlock_irqrestore(&psw->lock, flags); 359 spin_unlock_irqrestore(&ncalls->lock, flags);
365 360
366 /* Do really wake up now */ 361 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 362 error = (*nproc)(priv, cookie, call_nests);
368 363
369 /* Remove the current task from the list */ 364 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 365 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 366 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 367 out_unlock:
368 spin_unlock_irqrestore(&ncalls->lock, flags);
369
370 put_cpu();
371 return error;
372}
373
374#ifdef CONFIG_DEBUG_LOCK_ALLOC
375static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
376 unsigned long events, int subclass)
377{
378 unsigned long flags;
379
380 spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
381 wake_up_locked_poll(wqueue, events);
382 spin_unlock_irqrestore(&wqueue->lock, flags);
383}
384#else
385static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
386 unsigned long events, int subclass)
387{
388 wake_up_poll(wqueue, events);
389}
390#endif
391
392static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
393{
394 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
395 1 + call_nests);
396 return 0;
397}
398
399/*
400 * Perform a safe wake up of the poll wait list. The problem is that
401 * with the new callback'd wake up system, it is possible that the
402 * poll callback is reentered from inside the call to wake_up() done
403 * on the poll wait queue head. The rule is that we cannot reenter the
404 * wake up code from the same task more than EP_MAX_NESTS times,
405 * and we cannot reenter the same wait queue head at all. This allows
406 * a hierarchy of epoll file descriptors of no more than
407 * EP_MAX_NESTS deep.
408 */
409static void ep_poll_safewake(wait_queue_head_t *wq)
410{
411 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
412 ep_poll_wakeup_proc, NULL, wq);
373} 413}
374 414
375/* 415/*
376 * This function unregister poll callbacks from the associated file descriptor. 416 * This function unregisters poll callbacks from the associated file
377 * Since this must be called without holding "ep->lock" the atomic exchange trick 417 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
378 * will protect us from multiple unregister. 418 * ep_free).
379 */ 419 */
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 420static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{ 421{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist; 422 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq; 423 struct eppoll_entry *pwq;
385 424
386 /* This is called without locks, so we need the atomic exchange */ 425 while (!list_empty(lsthead)) {
387 nwait = xchg(&epi->nwait, 0); 426 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
388 427
389 if (nwait) { 428 list_del(&pwq->llink);
390 while (!list_empty(lsthead)) { 429 remove_wait_queue(pwq->whead, &pwq->wait);
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 430 kmem_cache_free(pwq_cache, pwq);
431 }
432}
392 433
393 list_del_init(&pwq->llink); 434/**
394 remove_wait_queue(pwq->whead, &pwq->wait); 435 * ep_scan_ready_list - Scans the ready list in a way that makes it possible for
395 kmem_cache_free(pwq_cache, pwq); 436 * the scan code to call f_op->poll(). Also allows for
396 } 437 * O(NumReady) performance.
438 *
439 * @ep: Pointer to the epoll private data structure.
440 * @sproc: Pointer to the scan callback.
441 * @priv: Private opaque data passed to the @sproc callback.
442 *
443 * Returns: The same integer error code returned by the @sproc callback.
444 */
445static int ep_scan_ready_list(struct eventpoll *ep,
446 int (*sproc)(struct eventpoll *,
447 struct list_head *, void *),
448 void *priv)
449{
450 int error, pwake = 0;
451 unsigned long flags;
452 struct epitem *epi, *nepi;
453 LIST_HEAD(txlist);
454
455 /*
456 * We need to lock this because we could be hit by
457 * eventpoll_release_file() and epoll_ctl().
458 */
459 mutex_lock(&ep->mtx);
460
461 /*
462 * Steal the ready list, and re-init the original one to the
463 * empty list. Also, set ep->ovflist to NULL so that events
464 * happening while looping w/out locks, are not lost. We cannot
465 * have the poll callback to queue directly on ep->rdllist,
466 * because we want the "sproc" callback to be able to do it
467 * in a lockless way.
468 */
469 spin_lock_irqsave(&ep->lock, flags);
470 list_splice_init(&ep->rdllist, &txlist);
471 ep->ovflist = NULL;
472 spin_unlock_irqrestore(&ep->lock, flags);
473
474 /*
475 * Now call the callback function.
476 */
477 error = (*sproc)(ep, &txlist, priv);
478
479 spin_lock_irqsave(&ep->lock, flags);
480 /*
481 * During the time we spent inside the "sproc" callback, some
482 * other events might have been queued by the poll callback.
483 * We re-insert them inside the main ready-list here.
484 */
485 for (nepi = ep->ovflist; (epi = nepi) != NULL;
486 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
487 /*
488 * We need to check if the item is already in the list.
489 * During the "sproc" callback execution time, items are
490 * queued into ->ovflist but the "txlist" might already
491 * contain them, and the list_splice() below takes care of them.
492 */
493 if (!ep_is_linked(&epi->rdllink))
494 list_add_tail(&epi->rdllink, &ep->rdllist);
495 }
496 /*
497 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
498 * releasing the lock, events will be queued in the normal way inside
499 * ep->rdllist.
500 */
501 ep->ovflist = EP_UNACTIVE_PTR;
502
503 /*
504 * Quickly re-inject items left on "txlist".
505 */
506 list_splice(&txlist, &ep->rdllist);
507
508 if (!list_empty(&ep->rdllist)) {
509 /*
510 * Wake up (if active) both the eventpoll wait list and
511 * the ->poll() wait list (delayed after we release the lock).
512 */
513 if (waitqueue_active(&ep->wq))
514 wake_up_locked(&ep->wq);
515 if (waitqueue_active(&ep->poll_wait))
516 pwake++;
397 } 517 }
518 spin_unlock_irqrestore(&ep->lock, flags);
519
520 mutex_unlock(&ep->mtx);
521
522 /* We have to call this outside the lock */
523 if (pwake)
524 ep_poll_safewake(&ep->poll_wait);
525
526 return error;
398} 527}
399 528
400/* 529/*
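
ep_call_nested() is the generalized bound introduced above: each invocation pushes a (cpu, cookie) node onto the structure's list, refuses with -1 if the same cookie is already present (a cycle) or if more than max_nests frames from this CPU are (too deep), and otherwise runs nproc. A hedged sketch of a call site (demo_nproc is illustrative only):

    static int demo_nproc(void *priv, void *cookie, int call_nests)
    {
            /* call_nests is the current depth; the safewake path uses it
             * as a lockdep subclass when waking nested wait queues. */
            return 0;
    }

    static void demo_nested_call(struct nested_calls *ncalls,
                                 wait_queue_head_t *wq)
    {
            /* The wait queue head doubles as the cookie: re-entering on
             * the same head is a loop, and the call is simply dropped. */
            if (ep_call_nested(ncalls, EP_MAX_NESTS, demo_nproc, NULL, wq) < 0)
                    pr_debug("nested call suppressed (loop or depth)\n");
    }
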
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
434 563
435 atomic_dec(&ep->user->epoll_watches); 564 atomic_dec(&ep->user->epoll_watches);
436 565
437 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
438 current, ep, file));
439
440 return 0; 566 return 0;
441} 567}
442 568
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
447 573
448 /* We need to release all tasks waiting for these file */ 574 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 575 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 576 ep_poll_safewake(&ep->poll_wait);
451 577
452 /* 578 /*
453 * We need to lock this because we could be hit by 579 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
492 if (ep) 618 if (ep)
493 ep_free(ep); 619 ep_free(ep);
494 620
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0; 621 return 0;
497} 622}
498 623
624static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
625 void *priv)
626{
627 struct epitem *epi, *tmp;
628
629 list_for_each_entry_safe(epi, tmp, head, rdllink) {
630 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
631 epi->event.events)
632 return POLLIN | POLLRDNORM;
633 else {
634 /*
635 * Item has been dropped into the ready list by the poll
636 * callback, but it's not actually ready, as far as
637 * the caller-requested events go. We can remove it here.
638 */
639 list_del_init(&epi->rdllink);
640 }
641 }
642
643 return 0;
644}
645
646static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
647{
648 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
649}
650
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 651static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 652{
501 unsigned int pollflags = 0; 653 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 654 struct eventpoll *ep = file->private_data;
504 655
505 /* Insert inside our poll wait queue */ 656 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 657 poll_wait(file, &ep->poll_wait, wait);
507 658
508 /* Check our condition */ 659 /*
509 spin_lock_irqsave(&ep->lock, flags); 660 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 661 * the ready list. This need to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 662 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 663 * could re-enter here.
664 */
665 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
666 ep_poll_readyevents_proc, ep, ep);
513 667
514 return pollflags; 668 return pollflags != -1 ? pollflags : 0;
515} 669}
516 670
517/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 695 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 696 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 697 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 698 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 699 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 700 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 701 * will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
588 742
589 *pep = ep; 743 *pep = ep;
590 744
591 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
592 current, ep));
593 return 0; 745 return 0;
594 746
595free_uid: 747free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
623 } 775 }
624 } 776 }
625 777
626 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
627 current, file, epir));
628
629 return epir; 778 return epir;
630} 779}
631 780
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
641 struct epitem *epi = ep_item_from_wait(wait); 790 struct epitem *epi = ep_item_from_wait(wait);
642 struct eventpoll *ep = epi->ep; 791 struct eventpoll *ep = epi->ep;
643 792
644 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
645 current, epi->ffd.file, epi, ep));
646
647 spin_lock_irqsave(&ep->lock, flags); 793 spin_lock_irqsave(&ep->lock, flags);
648 794
649 /* 795 /*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
656 goto out_unlock; 802 goto out_unlock;
657 803
658 /* 804 /*
805 * Check the events coming with the callback. At this stage, not
806 * every device reports the events in the "key" parameter of the
807 * callback. We need to be able to handle both cases here, hence the
808 * test for "key" != NULL before the event match test.
809 */
810 if (key && !((unsigned long) key & epi->event.events))
811 goto out_unlock;
812
813 /*
659 * If we are transferring events to userspace, we can hold no locks 814
660 * (because we're accessing user memory, and because of linux f_op->poll() 815
661 * semantics). All the events that happen during that period of time are 816
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 825 }
671 826
672 /* If this file is already in the ready list we exit soon */ 827 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 828 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 829 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 830
678is_linked:
679 /* 831 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 832 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 833 * wait list.
@@ -690,7 +842,7 @@ out_unlock:
690 842
691 /* We have to call this outside the lock */ 843 /* We have to call this outside the lock */
692 if (pwake) 844 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 845 ep_poll_safewake(&ep->poll_wait);
694 846
695 return 1; 847 return 1;
696} 848}
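
The wake_up_locked_poll() conversions and this new check are two halves of one mechanism: wakers now pass the just-raised events as the wakeup key, and ep_poll_callback() discards wakeups whose key does not intersect the item's interest set. Condensed from the hunks above:

    /* Waker side (e.g. eventfd): say which events became true. */
    if (waitqueue_active(&ctx->wqh))
            wake_up_locked_poll(&ctx->wqh, POLLIN);

    /* Callback side (ep_poll_callback): key may be NULL, since not every
     * driver reports events yet, so only filter when it is provided. */
    if (key && !((unsigned long) key & epi->event.events))
            goto out_unlock;        /* uninteresting wakeup, ignore it */
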
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 969
818 /* We have to call this outside the lock */ 970 /* We have to call this outside the lock */
819 if (pwake) 971 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 972 ep_poll_safewake(&ep->poll_wait);
821
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd));
824 973
825 return 0; 974 return 0;
826 975
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
851{ 1000{
852 int pwake = 0; 1001 int pwake = 0;
853 unsigned int revents; 1002 unsigned int revents;
854 unsigned long flags;
855 1003
856 /* 1004 /*
857 * Set the new event interest mask before calling f_op->poll(), otherwise 1005 * Set the new event interest mask before calling f_op->poll();
858 * a potential race might occur. In fact if we do this operation inside 1006 * otherwise we might miss an event that happens between the
859 * the lock, an event might happen between the f_op->poll() call and the 1007 * f_op->poll() call and the new event set registering.
860 * new event set registering.
861 */ 1008 */
862 epi->event.events = event->events; 1009 epi->event.events = event->events;
1010 epi->event.data = event->data; /* protected by mtx */
863 1011
864 /* 1012 /*
865 * Get current event bits. We can safely use the file* here because 1013 * Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
867 */ 1015 */
868 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1016 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
869 1017
870 spin_lock_irqsave(&ep->lock, flags);
871
872 /* Copy the data member from inside the lock */
873 epi->event.data = event->data;
874
875 /* 1018 /*
876 * If the item is "hot" and it is not registered inside the ready 1019 * If the item is "hot" and it is not registered inside the ready
877 * list, push it inside. 1020 * list, push it inside.
878 */ 1021 */
879 if (revents & event->events) { 1022 if (revents & event->events) {
1023 spin_lock_irq(&ep->lock);
880 if (!ep_is_linked(&epi->rdllink)) { 1024 if (!ep_is_linked(&epi->rdllink)) {
881 list_add_tail(&epi->rdllink, &ep->rdllist); 1025 list_add_tail(&epi->rdllink, &ep->rdllist);
882 1026
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
886 if (waitqueue_active(&ep->poll_wait)) 1030 if (waitqueue_active(&ep->poll_wait))
887 pwake++; 1031 pwake++;
888 } 1032 }
1033 spin_unlock_irq(&ep->lock);
889 } 1034 }
890 spin_unlock_irqrestore(&ep->lock, flags);
891 1035
892 /* We have to call this outside the lock */ 1036 /* We have to call this outside the lock */
893 if (pwake) 1037 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1038 ep_poll_safewake(&ep->poll_wait);
895 1039
896 return 0; 1040 return 0;
897} 1041}
898 1042
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1043static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
900 int maxevents) 1044 void *priv)
901{ 1045{
902 int eventcnt, error = -EFAULT, pwake = 0; 1046 struct ep_send_events_data *esed = priv;
1047 int eventcnt;
903 unsigned int revents; 1048 unsigned int revents;
904 unsigned long flags; 1049 struct epitem *epi;
905 struct epitem *epi, *nepi; 1050 struct epoll_event __user *uevent;
906 struct list_head txlist;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1051
929 /* 1052 /*
930 * We can loop without lock because this is a task private list. 1053 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1054 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1055 * holding "mtx" during this call.
933 */ 1056 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1057 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1058 !list_empty(head) && eventcnt < esed->maxevents;) {
1059 epi = list_first_entry(head, struct epitem, rdllink);
936 1060
937 list_del_init(&epi->rdllink); 1061 list_del_init(&epi->rdllink);
938 1062
939 /* 1063 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1064 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee
942 * that both the file and the item will not vanish.
943 */
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
945 revents &= epi->event.events;
946 1065
947 /* 1066 /*
948 * Is the event mask intersect the caller-requested one, 1067 * If the event mask intersects the caller-requested one,
949 * deliver the event to userspace. Again, we are holding 1068 * deliver the event to userspace. Again, ep_scan_ready_list()
950 * "mtx", so no operations coming from userspace can change 1069 * is holding "mtx", so no operations coming from userspace
951 * the item. 1070 * can change the item.
952 */ 1071 */
953 if (revents) { 1072 if (revents) {
954 if (__put_user(revents, 1073 if (__put_user(revents, &uevent->events) ||
955 &events[eventcnt].events) || 1074 __put_user(epi->event.data, &uevent->data)) {
956 __put_user(epi->event.data, 1075 list_add(&epi->rdllink, head);
957 &events[eventcnt].data)) 1076 return eventcnt ? eventcnt : -EFAULT;
958 goto errxit; 1077 }
1078 eventcnt++;
1079 uevent++;
959 if (epi->event.events & EPOLLONESHOT) 1080 if (epi->event.events & EPOLLONESHOT)
960 epi->event.events &= EP_PRIVATE_BITS; 1081 epi->event.events &= EP_PRIVATE_BITS;
961 eventcnt++; 1082 else if (!(epi->event.events & EPOLLET)) {
1083 /*
1084 * If this file has been added with Level
1085 * Trigger mode, we need to insert back inside
1086 * the ready list, so that the next call to
1087 * epoll_wait() will check again the events
1088 * availability. At this point, no one can insert
1089 * into ep->rdllist besides us. The epoll_ctl()
1090 * callers are locked out by
1091 * ep_scan_ready_list() holding "mtx" and the
1092 * poll callback will queue them in ep->ovflist.
1093 */
1094 list_add_tail(&epi->rdllink, &ep->rdllist);
1095 }
962 } 1096 }
963 /*
964 * At this point, noone can insert into ep->rdllist besides
965 * us. The epoll_ctl() callers are locked out by us holding
966 * "mtx" and the poll callback will queue them in ep->ovflist.
967 */
968 if (!(epi->event.events & EPOLLET) &&
969 (revents & epi->event.events))
970 list_add_tail(&epi->rdllink, &ep->rdllist);
971 }
972 error = 0;
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 } 1097 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998 1098
999 /* 1099 return eventcnt;
1000 * In case of error in the event-send loop, or in case the number of 1100}
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1101
1018 mutex_unlock(&ep->mtx); 1102static int ep_send_events(struct eventpoll *ep,
1103 struct epoll_event __user *events, int maxevents)
1104{
1105 struct ep_send_events_data esed;
1019 1106
1020 /* We have to call this outside the lock */ 1107 esed.maxevents = maxevents;
1021 if (pwake) 1108 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1109
1024 return eventcnt == 0 ? error: eventcnt; 1110 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1111}
1026 1112
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1113static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
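
The list_add_tail() back onto ep->rdllist for non-EPOLLET items in ep_send_events_proc() is what implements level-triggered delivery: a still-ready fd is reported again by the next epoll_wait(). The userspace-visible difference, sketched (plain C, error checking omitted):

    #include <stdio.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int pfd[2];
            pipe(pfd);
            write(pfd[1], "x", 1);          /* one unread byte */

            int epfd = epoll_create(1);
            struct epoll_event ev = { .events = EPOLLIN };  /* level-triggered */
            epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);

            struct epoll_event out;
            /* Level-triggered: the unread byte keeps the fd on the ready
             * list, so both waits report it. With EPOLLIN | EPOLLET only
             * the first wait would. */
            printf("first wait:  %d\n", epoll_wait(epfd, &out, 1, 0));
            printf("second wait: %d\n", epoll_wait(epfd, &out, 1, 0));
            return 0;
    }
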
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1119 wait_queue_t wait;
1034 1120
1035 /* 1121 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1122 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1123 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that's why (t * HZ) / 1000. 1124 * that's why (t * HZ) / 1000.
1039 */ 1125 */
@@ -1076,9 +1162,8 @@ retry:
1076 1162
1077 set_current_state(TASK_RUNNING); 1163 set_current_state(TASK_RUNNING);
1078 } 1164 }
1079
1080 /* Is it worth trying to dig for events? */ 1165 /* Is it worth trying to dig for events? */
1081 eavail = !list_empty(&ep->rdllist); 1166 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1167
1083 spin_unlock_irqrestore(&ep->lock, flags); 1168 spin_unlock_irqrestore(&ep->lock, flags);
1084 1169
@@ -1099,46 +1184,35 @@ retry:
1099 */ 1184 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1185SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1186{
1102 int error, fd = -1; 1187 int error;
1103 struct eventpoll *ep; 1188 struct eventpoll *ep = NULL;
1104 1189
1105 /* Check the EPOLL_* constant for consistency. */ 1190 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1191 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1192
1108 if (flags & ~EPOLL_CLOEXEC) 1193 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL; 1194 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags));
1113
1114 /* 1195 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1196 * Create the internal data structure ("struct eventpoll").
1116 */ 1197 */
1117 error = ep_alloc(&ep); 1198 error = ep_alloc(&ep);
1118 if (error < 0) { 1199 if (error < 0)
1119 fd = error; 1200 return error;
1120 goto error_return;
1121 }
1122
1123 /* 1201 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1202 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1203 * a file structure and a free file descriptor.
1126 */ 1204 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1205 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1206 flags & O_CLOEXEC);
1129 if (fd < 0) 1207 if (error < 0)
1130 ep_free(ep); 1208 ep_free(ep);
1131 1209
1132error_return: 1210 return error;
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd));
1135
1136 return fd;
1137} 1211}
1138 1212
1139SYSCALL_DEFINE1(epoll_create, int, size) 1213SYSCALL_DEFINE1(epoll_create, int, size)
1140{ 1214{
1141 if (size < 0) 1215 if (size <= 0)
1142 return -EINVAL; 1216 return -EINVAL;
1143 1217
1144 return sys_epoll_create1(0); 1218 return sys_epoll_create1(0);
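
Note the tightened epoll_create() check: the historically meaningless size hint must now be strictly positive rather than merely non-negative. A quick userspace confirmation (assuming a kernel with this change):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/epoll.h>

    int main(void)
    {
            /* size is only a hint, but zero is now rejected */
            if (epoll_create(0) == -1 && errno == EINVAL)
                    printf("epoll_create(0) -> EINVAL\n");

            printf("epoll_create(1) -> fd %d\n", epoll_create(1));
            return 0;
    }
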
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1158 struct epitem *epi; 1232 struct epitem *epi;
1159 struct epoll_event epds; 1233 struct epoll_event epds;
1160 1234
1161 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1162 current, epfd, op, fd, event));
1163
1164 error = -EFAULT; 1235 error = -EFAULT;
1165 if (ep_op_has_event(op) && 1236 if (ep_op_has_event(op) &&
1166 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1237 copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1211 case EPOLL_CTL_ADD: 1282 case EPOLL_CTL_ADD:
1212 if (!epi) { 1283 if (!epi) {
1213 epds.events |= POLLERR | POLLHUP; 1284 epds.events |= POLLERR | POLLHUP;
1214
1215 error = ep_insert(ep, &epds, tfile, fd); 1285 error = ep_insert(ep, &epds, tfile, fd);
1216 } else 1286 } else
1217 error = -EEXIST; 1287 error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
1237error_fput: 1307error_fput:
1238 fput(file); 1308 fput(file);
1239error_return: 1309error_return:
1240 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1241 current, epfd, op, fd, event, error));
1242 1310
1243 return error; 1311 return error;
1244} 1312}
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1254 struct file *file; 1322 struct file *file;
1255 struct eventpoll *ep; 1323 struct eventpoll *ep;
1256 1324
1257 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1258 current, epfd, events, maxevents, timeout));
1259
1260 /* The maximum number of events must be greater than zero */ 1325 /* The maximum number of events must be greater than zero */
1261 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1326 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1262 return -EINVAL; 1327 return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1293error_fput: 1358error_fput:
1294 fput(file); 1359 fput(file);
1295error_return: 1360error_return:
1296 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1297 current, epfd, events, maxevents, timeout, error));
1298 1361
1299 return error; 1362 return error;
1300} 1363}
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1422 EP_ITEM_COST;
1360 1423
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1424 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1425 ep_nested_calls_init(&poll_safewake_ncalls);
1426
1427 /* Initialize the structure used to perform file's f_op->poll() calls */
1428 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1429
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1430 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1431 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1366 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, 1432 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1367 NULL);
1368 1433
1369 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 1434 /* Allocates slab cache used to allocate "struct eppoll_entry" */
1370 pwq_cache = kmem_cache_create("eventpoll_pwq", 1435 pwq_cache = kmem_cache_create("eventpoll_pwq",
1371 sizeof(struct eppoll_entry), 0, 1436 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1372 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1373 1437
1374 return 0; 1438 return 0;
1375} 1439}
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..895823d0149d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h> 55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
56 57
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
@@ -68,17 +69,18 @@ int suid_dumpable = 0;
68static LIST_HEAD(formats); 69static LIST_HEAD(formats);
69static DEFINE_RWLOCK(binfmt_lock); 70static DEFINE_RWLOCK(binfmt_lock);
70 71
71int register_binfmt(struct linux_binfmt * fmt) 72int __register_binfmt(struct linux_binfmt * fmt, int insert)
72{ 73{
73 if (!fmt) 74 if (!fmt)
74 return -EINVAL; 75 return -EINVAL;
75 write_lock(&binfmt_lock); 76 write_lock(&binfmt_lock);
76 list_add(&fmt->lh, &formats); 77 insert ? list_add(&fmt->lh, &formats) :
78 list_add_tail(&fmt->lh, &formats);
77 write_unlock(&binfmt_lock); 79 write_unlock(&binfmt_lock);
78 return 0; 80 return 0;
79} 81}
80 82
81EXPORT_SYMBOL(register_binfmt); 83EXPORT_SYMBOL(__register_binfmt);
82 84
83void unregister_binfmt(struct linux_binfmt * fmt) 85void unregister_binfmt(struct linux_binfmt * fmt)
84{ 86{
@@ -103,40 +105,28 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
103SYSCALL_DEFINE1(uselib, const char __user *, library) 105SYSCALL_DEFINE1(uselib, const char __user *, library)
104{ 106{
105 struct file *file; 107 struct file *file;
106 struct nameidata nd;
107 char *tmp = getname(library); 108 char *tmp = getname(library);
108 int error = PTR_ERR(tmp); 109 int error = PTR_ERR(tmp);
109 110
110 if (!IS_ERR(tmp)) { 111 if (IS_ERR(tmp))
111 error = path_lookup_open(AT_FDCWD, tmp, 112 goto out;
112 LOOKUP_FOLLOW, &nd, 113
113 FMODE_READ|FMODE_EXEC); 114 file = do_filp_open(AT_FDCWD, tmp,
114 putname(tmp); 115 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
115 } 116 MAY_READ | MAY_EXEC | MAY_OPEN);
116 if (error) 117 putname(tmp);
118 error = PTR_ERR(file);
119 if (IS_ERR(file))
117 goto out; 120 goto out;
118 121
119 error = -EINVAL; 122 error = -EINVAL;
120 if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) 123 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
121 goto exit; 124 goto exit;
122 125
123 error = -EACCES; 126 error = -EACCES;
124 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 127 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
125 goto exit; 128 goto exit;
126 129
127 error = inode_permission(nd.path.dentry->d_inode,
128 MAY_READ | MAY_EXEC | MAY_OPEN);
129 if (error)
130 goto exit;
131 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error)
133 goto exit;
134
135 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
136 error = PTR_ERR(file);
137 if (IS_ERR(file))
138 goto out;
139
140 fsnotify_open(file->f_path.dentry); 130 fsnotify_open(file->f_path.dentry);
141 131
142 error = -ENOEXEC; 132 error = -ENOEXEC;
@@ -158,13 +148,10 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
158 } 148 }
159 read_unlock(&binfmt_lock); 149 read_unlock(&binfmt_lock);
160 } 150 }
151exit:
161 fput(file); 152 fput(file);
162out: 153out:
163 return error; 154 return error;
164exit:
165 release_open_intent(&nd);
166 path_put(&nd.path);
167 goto out;
168} 155}
169 156
170#ifdef CONFIG_MMU 157#ifdef CONFIG_MMU
@@ -659,47 +646,33 @@ EXPORT_SYMBOL(setup_arg_pages);
659 646
660struct file *open_exec(const char *name) 647struct file *open_exec(const char *name)
661{ 648{
662 struct nameidata nd;
663 struct file *file; 649 struct file *file;
664 int err; 650 int err;
665 651
666 err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, 652 file = do_filp_open(AT_FDCWD, name,
667 FMODE_READ|FMODE_EXEC); 653 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
668 if (err) 654 MAY_EXEC | MAY_OPEN);
655 if (IS_ERR(file))
669 goto out; 656 goto out;
670 657
671 err = -EACCES; 658 err = -EACCES;
672 if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) 659 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
673 goto out_path_put; 660 goto exit;
674
675 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
676 goto out_path_put;
677
678 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
679 if (err)
680 goto out_path_put;
681 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
682 if (err)
683 goto out_path_put;
684 661
685 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 662 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
686 if (IS_ERR(file)) 663 goto exit;
687 return file;
688 664
689 fsnotify_open(file->f_path.dentry); 665 fsnotify_open(file->f_path.dentry);
690 666
691 err = deny_write_access(file); 667 err = deny_write_access(file);
692 if (err) { 668 if (err)
693 fput(file); 669 goto exit;
694 goto out;
695 }
696 670
671out:
697 return file; 672 return file;
698 673
699 out_path_put: 674exit:
700 release_open_intent(&nd); 675 fput(file);
701 path_put(&nd.path);
702 out:
703 return ERR_PTR(err); 676 return ERR_PTR(err);
704} 677}
705EXPORT_SYMBOL(open_exec); 678EXPORT_SYMBOL(open_exec);
@@ -1056,28 +1029,35 @@ EXPORT_SYMBOL(install_exec_creds);
1056 * - the caller must hold current->cred_exec_mutex to protect against 1029 * - the caller must hold current->cred_exec_mutex to protect against
1057 * PTRACE_ATTACH 1030 * PTRACE_ATTACH
1058 */ 1031 */
1059void check_unsafe_exec(struct linux_binprm *bprm) 1032int check_unsafe_exec(struct linux_binprm *bprm)
1060{ 1033{
1061 struct task_struct *p = current, *t; 1034 struct task_struct *p = current, *t;
1062 unsigned long flags; 1035 unsigned n_fs;
1063 unsigned n_fs, n_sighand; 1036 int res = 0;
1064 1037
1065 bprm->unsafe = tracehook_unsafe_exec(p); 1038 bprm->unsafe = tracehook_unsafe_exec(p);
1066 1039
1067 n_fs = 1; 1040 n_fs = 1;
1068 n_sighand = 1; 1041 write_lock(&p->fs->lock);
1069 lock_task_sighand(p, &flags); 1042 rcu_read_lock();
1070 for (t = next_thread(p); t != p; t = next_thread(t)) { 1043 for (t = next_thread(p); t != p; t = next_thread(t)) {
1071 if (t->fs == p->fs) 1044 if (t->fs == p->fs)
1072 n_fs++; 1045 n_fs++;
1073 n_sighand++;
1074 } 1046 }
1047 rcu_read_unlock();
1075 1048
1076 if (atomic_read(&p->fs->count) > n_fs || 1049 if (p->fs->users > n_fs) {
1077 atomic_read(&p->sighand->count) > n_sighand)
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1050 bprm->unsafe |= LSM_UNSAFE_SHARE;
1051 } else {
1052 res = -EAGAIN;
1053 if (!p->fs->in_exec) {
1054 p->fs->in_exec = 1;
1055 res = 1;
1056 }
1057 }
1058 write_unlock(&p->fs->lock);
1079 1059
1080 unlock_task_sighand(p, &flags); 1060 return res;
1081} 1061}
1082 1062
1083/* 1063/*
@@ -1276,6 +1256,7 @@ int do_execve(char * filename,
1276 struct linux_binprm *bprm; 1256 struct linux_binprm *bprm;
1277 struct file *file; 1257 struct file *file;
1278 struct files_struct *displaced; 1258 struct files_struct *displaced;
1259 bool clear_in_exec;
1279 int retval; 1260 int retval;
1280 1261
1281 retval = unshare_files(&displaced); 1262 retval = unshare_files(&displaced);
@@ -1296,12 +1277,16 @@ int do_execve(char * filename,
1296 bprm->cred = prepare_exec_creds(); 1277 bprm->cred = prepare_exec_creds();
1297 if (!bprm->cred) 1278 if (!bprm->cred)
1298 goto out_unlock; 1279 goto out_unlock;
1299 check_unsafe_exec(bprm); 1280
1281 retval = check_unsafe_exec(bprm);
1282 if (retval < 0)
1283 goto out_unlock;
1284 clear_in_exec = retval;
1300 1285
1301 file = open_exec(filename); 1286 file = open_exec(filename);
1302 retval = PTR_ERR(file); 1287 retval = PTR_ERR(file);
1303 if (IS_ERR(file)) 1288 if (IS_ERR(file))
1304 goto out_unlock; 1289 goto out_unmark;
1305 1290
1306 sched_exec(); 1291 sched_exec();
1307 1292
@@ -1344,6 +1329,7 @@ int do_execve(char * filename,
1344 goto out; 1329 goto out;
1345 1330
1346 /* execve succeeded */ 1331 /* execve succeeded */
1332 current->fs->in_exec = 0;
1347 current->in_execve = 0; 1333 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1334 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1335 acct_update_integrals(current);
@@ -1362,6 +1348,10 @@ out_file:
1362 fput(bprm->file); 1348 fput(bprm->file);
1363 } 1349 }
1364 1350
1351out_unmark:
1352 if (clear_in_exec)
1353 current->fs->in_exec = 0;
1354
1365out_unlock: 1355out_unlock:
1366 current->in_execve = 0; 1356 current->in_execve = 0;
1367 mutex_unlock(&current->cred_exec_mutex); 1357 mutex_unlock(&current->cred_exec_mutex);
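
The do_execve() hunks above depend on a small contract from the reworked
check_unsafe_exec(): a negative return aborts the exec (-EAGAIN when another
thread already holds fs->in_exec), while a return of 1 means this caller set
the mark and owns clearing it on failure. A condensed sketch of the caller
side, with names taken from the patch and unrelated steps elided:

	bool clear_in_exec;
	int retval;

	retval = check_unsafe_exec(bprm);
	if (retval < 0)			/* -EAGAIN: an exec is already in flight */
		goto out_unlock;
	clear_in_exec = retval;		/* 1 => we set fs->in_exec */

	/* ... open_exec(), argument copy, search_binary_handler() ... */

	current->fs->in_exec = 0;	/* success: drop the mark */
	return retval;

out_unmark:
	if (clear_in_exec)
		current->fs->in_exec = 0;	/* failure: undo only our own mark */
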
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2 were written, but writing the inode attributes failed. If the filesystem is
3 then unmounted and remounted, the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - Gets included from the kernel's Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5 EXOFS is a file system that uses an OSD storage device
6 as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
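
For reference, an illustrative .config fragment (not from the patch) that
would build the new filesystem as a module with its debug prints enabled;
SCSI_OSD_ULD is the OSD initiator library the Kconfig above depends on:

	CONFIG_SCSI_OSD_ULD=m
	CONFIG_EXOFS_FS=m
	CONFIG_EXOFS_DEBUG=y
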
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* umm...file type */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
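
A stand-alone sketch (names local to the sketch, mirroring the definitions
above) of the two arithmetic rules this header fixes: object ID = inode# +
EXOFS_OBJ_OFF, and record lengths rounded up to 4 bytes. With the 12-byte
fixed header of struct exofs_dir_entry (8 + 2 + 1 + 1 bytes before name),
EXOFS_DIR_REC_LEN(1) works out to 16.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct dir_entry_hdr {			/* mirrors the on-disk layout above */
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

#define OBJ_OFF		0x10000
#define DIR_PAD		4
#define DIR_ROUND	(DIR_PAD - 1)
#define DIR_REC_LEN(nl) \
	(((nl) + offsetof(struct dir_entry_hdr, name) + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* inode 2 maps to object 0x10002, i.e. EXOFS_ROOT_ID */
	printf("object ID for inode 2: 0x%lx\n", (unsigned long)(2 + OBJ_OFF));
	printf("rec_len for 1-char name: %zu\n", DIR_REC_LEN(1));	/* 16 */
	printf("rec_len for 6-char name: %zu\n", DIR_REC_LEN(6));	/* 20 */
	return 0;
}
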
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
 49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156 "entry in directory #%lu spans the page boundary"
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
 425 EXOFS_ERR("exofs_set_link: exofs_write_begin failed => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477 "zero-length directory entry");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
544 while (de < dir) {
545 if (de->rec_len == 0) {
 546 EXOFS_ERR("ERROR: exofs_delete_entry: "
547 "zero-length directory entry");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
 561 EXOFS_ERR("exofs_delete_entry: exofs_write_begin failed => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639 EXOFS_ERR("ERROR: exofs_empty_dir: "
640 "zero-length directory entry"
641 "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
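
exofs_delete_entry() above never shifts directory data: it zeroes the
victim's inode_no and, when a previous entry exists in the same chunk, grows
that entry's rec_len to swallow the freed record. A stand-alone sketch of
that merge rule on a fake in-memory chunk (simplified layout, no page or OSD
machinery):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct dent {				/* 16 bytes with the 4-byte name */
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[4];
};

int main(void)
{
	unsigned char chunk[64] = { 0 };
	struct dent *a = (struct dent *)chunk;
	struct dent *b = (struct dent *)(chunk + 16);

	*a = (struct dent){ .inode_no = 11, .rec_len = 16, .name_len = 1 };
	memcpy(a->name, "x", 1);
	*b = (struct dent){ .inode_no = 12, .rec_len = 48, .name_len = 1 };
	memcpy(b->name, "y", 1);

	/* "delete" b: the previous entry absorbs the freed record */
	b->inode_no = 0;
	a->rec_len += b->rec_len;

	printf("a now spans %u bytes of the 64-byte chunk\n",
	       (unsigned)a->rec_len);
	return 0;
}
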
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
 53/* u64 has problems with printk; this will cast it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED 0 /* object will be created soon*/
88#define OBJ_CREATED 1 /* object has been created on the osd*/
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
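
exofs_i() above is the standard embedded-inode trick: the VFS only ever sees
the struct inode member, and container_of() recovers the surrounding
exofs_i_info by subtracting the member's offset. A stand-alone illustration
(simplified container_of, without the type checking the kernel macro adds):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };
struct fs_inode { int fs_private; struct vfs_inode vfs; };

int main(void)
{
	struct fs_inode fi = { .fs_private = 42, .vfs = { .i_ino = 7 } };
	struct vfs_inode *seen_by_vfs = &fi.vfs;

	/* recover the embedding structure from the embedded member */
	struct fs_inode *back = container_of(seen_by_vfs, struct fs_inode, vfs);
	printf("private=%d ino=%lu\n", back->fs_private, back->vfs.i_ino);
	return 0;
}
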
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
 55 /* Note: file_fsync below also calls sync_blockdev, which is a no-op
 56 * for exofs, but other than that it does sync_inode and
 57 * sync_superblock, which is what we need here.
58 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
 55 loff_t pg_first; /* keep 64 bits, even on 32-bit arches */
56};
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
 89 /* this is probably the end of the loop, but in writes
 90 * it might not end here. Don't be left with nothing.
 91 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
 106 EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
110
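
pcol_try_alloc() above degrades gracefully: if a bio with room for all
expected pages cannot be allocated, it halves the request until even a
single-page bio fails. A stand-alone sketch of the same back-off pattern;
try_alloc() is a hypothetical stand-in for bio_alloc():

#include <stdio.h>
#include <stdlib.h>

/* stand-in allocator: pretend anything over 64 pages fails */
static void *try_alloc(int pages)
{
	return pages > 64 ? NULL : malloc((size_t)pages * 8);
}

static void *alloc_backoff(int pages)
{
	for (; pages; pages >>= 1) {	/* halve on failure, as above */
		void *p = try_alloc(pages);
		if (p) {
			printf("got room for %d pages\n", pages);
			return p;
		}
	}
	return NULL;			/* caller turns this into -ENOMEM */
}

int main(void)
{
	free(alloc_backoff(256));	/* succeeds once halved to 64 */
	return 0;
}
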
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
 141 * locked) */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* pages ownership was passed to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
 307 * collect as many contiguous pages as possible. If a discontinuity is
 308 * encountered, or it runs out of resources, it will submit the previous segment
 309 * and will start a new collection. Eventually the caller must submit the last
 310 * segment if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* this will be out of bounds, or doesn't exist yet.
336 * Current page is cleared and the request is split
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
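
readpage_strip() above builds runs by one rule: a page joins the current
collection only if its index is exactly pg_first + nr_pages; anything else
submits the collection and starts a new one. A stand-alone sketch of the
rule; flush() is a hypothetical stand-in for read_exec():

#include <stdio.h>

static long pg_first = -1;
static unsigned nr_pages;

static void flush(void)
{
	if (nr_pages)
		printf("submit [%ld..%ld]\n", pg_first,
		       pg_first + nr_pages - 1);
	pg_first = -1;
	nr_pages = 0;
}

static void add_page(long index)
{
	if (pg_first == -1)
		pg_first = index;
	else if (pg_first + nr_pages != index) {	/* discontinuity */
		flush();
		pg_first = index;
	}
	nr_pages++;
}

int main(void)
{
	long idx[] = { 3, 4, 5, 9, 10, 20 };

	for (unsigned i = 0; i < 6; i++)
		add_page(idx[i]);
	flush();	/* the caller must submit the last segment */
	return 0;
}
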
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
 422 /* readpage_strip might call read_exec(, async) at several places inside,
 423 * but this is safe for the sync case, since read_exec will not do
 424 * anything when we have a single page.
425 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
 443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
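
/* Worked example of the good_bytes accounting above, assuming a
 * four-page, 0x4000-byte collection (illustrative values):
 *
 *	ret == 0                    => good_bytes = 0x4000 (all clean)
 *	ret != 0 && resid == 0      => good_bytes = 0      (nothing written)
 *	ret != 0 && resid == 0x1000 => good_bytes = 0x3000 (3 pages ok)
 *
 * Pages whose running offset stays below good_bytes get status 0; the
 * rest inherit the error code in ret.
 */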
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* pages ownership was passed to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually the caller must submit the last segment, if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623 EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
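
/* Worked example of the expected_pages estimate above, assuming 4K
 * pages (illustrative values): range_start == 0 and range_end == 0x7fff
 * give start == 0, end == 7, so expected_pages = min(8, 32) = 8. For
 * range_end == LLONG_MAX the estimate is derived from mapping->nrpages
 * instead, still capped at 32 preallocated pages.
 */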
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702 EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709 /* read-modify-write: a partial write into a not-uptodate page needs the old contents first */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713 /*SetPageError was done by _readpage. Is it ok?*/
714 unlock_page(page);
715 EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case of create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
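
/* A sketch of the object-creation handshake this helper relies on
 * (illustrative summary; the flag helpers are defined elsewhere,
 * presumably in exofs.h):
 *
 *	exofs_new_inode()  -> set_obj_2bcreated(oi), async CREATE queued
 *	create_done()      -> set_obj_created(oi); wake_up(&oi->i_wq)
 *	everyone else      -> wait_event(oi->i_wq, obj_created(oi))
 *
 * so no I/O path touches an object the OSD has not acknowledged yet.
 */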
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035 _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
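
/* For illustration: every exofs inode maps to the OSD object
 * {sbi->s_pid, i_ino + EXOFS_OBJ_OFF}, as set up above. If
 * EXOFS_OBJ_OFF were 0x10000 (illustrative value), inode 5 would live
 * in object id 0x10005 of the mounted partition.
 */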
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292 EXOFS_ERR(
1293 "ERROR: @exofs_delete_inode exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
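
/* exofs_add_nondir() follows the usual ext2-style "link or undo"
 * pattern: if exofs_add_link() fails, the new inode's link count is
 * dropped and its last reference released, so the create/mknod/symlink
 * callers below only need to propagate err.
 */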
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41 else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51 /* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
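
/* Summary of the sense-to-errno mapping implemented above:
 *
 *	invalid field in CDB, starting byte  => -EFAULT
 *	invalid field in CDB, object id      => -ENOENT
 *	invalid field in CDB, anything else  => -EINVAL
 *	quota error                          => -ENOSPC
 *	any other failed check condition     => -EIO
 */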
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
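
/* Minimal usage sketch for the helpers above (hypothetical caller,
 * error handling trimmed) -- the same pattern exofs_get_inode() and
 * exofs_statfs() use:
 *
 *	struct osd_attr attr = g_attr_logical_length;
 *	struct osd_request *or = osd_start_request(sbi->s_dev, GFP_KERNEL);
 *
 *	osd_req_get_attributes(or, &obj);
 *	osd_req_add_get_attr_list(or, &attr, 1);
 *	exofs_sync_op(or, sbi->s_timeout, cred);
 *	if (!extract_attr_from_req(or, &attr))
 *		use(attr.val_ptr, attr.len);	// hypothetical consumer
 *	osd_end_request(or);
 */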
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
64 * kernel's parsing functions do not currently support that. 32-bit should be
65 * sufficient for most applications now.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103 EXOFS_ERR("Partition ID must be >= %u",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107 s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113 EXOFS_ERR("Timeout must be > 0");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
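
/* Examples of option strings accepted above (illustrative values):
 *
 *	"pid=0x10001,to=30" => opts->pid == 0x10001 (simple_strtoull,
 *	                       base 0, so hex works), timeout == 30 * HZ
 *	"pid=65537"         => decimal works too; default timeout kept
 *	"to=30"             => rejected: pid= is mandatory
 */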
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283 struct exofs_sb_info *sbi; /*extended info */
284 struct exofs_fscb fscb; /*on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}
416
417/*
418 * Return information about the file system state in the buffer. This is used
419 * by the 'df' command, for example.
420 */
421static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
422{
423 struct super_block *sb = dentry->d_sb;
424 struct exofs_sb_info *sbi = sb->s_fs_info;
425 struct osd_obj_id obj = {sbi->s_pid, 0};
426 struct osd_attr attrs[] = {
427 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
428 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
429 ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
430 OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
431 };
432 uint64_t capacity = ULLONG_MAX;
433 uint64_t used = ULLONG_MAX;
434 struct osd_request *or;
435 uint8_t cred_a[OSD_CAP_LEN];
436 int ret;
437
438 /* get used/capacity attributes */
439 exofs_make_credential(cred_a, &obj);
440
441 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
442 if (unlikely(!or)) {
443 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
444 return -ENOMEM;
445 }
446
447 osd_req_get_attributes(or, &obj);
448 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
449 ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
450 if (unlikely(ret))
451 goto out;
452
453 ret = extract_attr_from_req(or, &attrs[0]);
454 if (likely(!ret))
455 capacity = get_unaligned_be64(attrs[0].val_ptr);
456 else
457 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
458
459 ret = extract_attr_from_req(or, &attrs[1]);
460 if (likely(!ret))
461 used = get_unaligned_be64(attrs[1].val_ptr);
462 else
463 EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
464
465 /* fill in the stats buffer */
466 buf->f_type = EXOFS_SUPER_MAGIC;
467 buf->f_bsize = EXOFS_BLKSIZE;
468 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
469 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
470 buf->f_bavail = buf->f_bfree;
471 buf->f_files = sbi->s_numfiles;
472 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
473 buf->f_namelen = EXOFS_NAME_LEN;
474
475out:
476 osd_end_request(or);
477 return ret;
478}
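
/* Worked example of the conversion above, assuming 4K blocks
 * (EXOFS_BLKSHIFT == 12, illustrative): capacity == 4GiB and
 * used == 1GiB yield f_blocks == 0x100000 and
 * f_bfree == f_bavail == 0xc0000 blocks.
 */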
479
480static const struct super_operations exofs_sops = {
481 .alloc_inode = exofs_alloc_inode,
482 .destroy_inode = exofs_destroy_inode,
483 .write_inode = exofs_write_inode,
484 .delete_inode = exofs_delete_inode,
485 .put_super = exofs_put_super,
486 .write_super = exofs_write_super,
487 .statfs = exofs_statfs,
488};
489
490/******************************************************************************
491 * EXPORT OPERATIONS
492 *****************************************************************************/
493
494struct dentry *exofs_get_parent(struct dentry *child)
495{
496 unsigned long ino = exofs_parent_ino(child);
497
498 if (!ino)
499 return NULL;
500
501 return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
502}
503
504static struct inode *exofs_nfs_get_inode(struct super_block *sb,
505 u64 ino, u32 generation)
506{
507 struct inode *inode;
508
509 inode = exofs_iget(sb, ino);
510 if (IS_ERR(inode))
511 return ERR_CAST(inode);
512 if (generation && inode->i_generation != generation) {
513 /* we didn't find the right inode.. */
514 iput(inode);
515 return ERR_PTR(-ESTALE);
516 }
517 return inode;
518}
519
520static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
521 struct fid *fid, int fh_len, int fh_type)
522{
523 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
524 exofs_nfs_get_inode);
525}
526
527static struct dentry *exofs_fh_to_parent(struct super_block *sb,
528 struct fid *fid, int fh_len, int fh_type)
529{
530 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
531 exofs_nfs_get_inode);
532}
533
534static const struct export_operations exofs_export_ops = {
535 .fh_to_dentry = exofs_fh_to_dentry,
536 .fh_to_parent = exofs_fh_to_parent,
537 .get_parent = exofs_get_parent,
538};
539
540/******************************************************************************
541 * INSMOD/RMMOD
542 *****************************************************************************/
543
544/*
545 * struct that describes this file system
546 */
547static struct file_system_type exofs_type = {
548 .owner = THIS_MODULE,
549 .name = "exofs",
550 .get_sb = exofs_get_sb,
551 .kill_sb = generic_shutdown_super,
552};
553
554static int __init init_exofs(void)
555{
556 int err;
557
558 err = init_inodecache();
559 if (err)
560 goto out;
561
562 err = register_filesystem(&exofs_type);
563 if (err)
564 goto out_d;
565
566 return 0;
567out_d:
568 destroy_inodecache();
569out:
570 return err;
571}
572
573static void __exit exit_exofs(void)
574{
575 unregister_filesystem(&exofs_type);
576 destroy_inodecache();
577}
578
579MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
580MODULE_DESCRIPTION("exofs");
581MODULE_LICENSE("GPL");
582
583module_init(init_exofs)
584module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/namei.h>
37
38#include "exofs.h"
39
40static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
41{
42 struct exofs_i_info *oi = exofs_i(dentry->d_inode);
43
44 nd_set_link(nd, (char *)oi->i_data);
45 return NULL;
46}
47
48const struct inode_operations exofs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52};
53
54const struct inode_operations exofs_fast_symlink_inode_operations = {
55 .readlink = generic_readlink,
56 .follow_link = exofs_follow_link,
57};
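
/* The two symlink flavors wired up above, for reference:
 * exofs_symlink() (namei.c) stores targets longer than
 * sizeof(oi->i_data) through the page cache ("slow" symlinks, resolved
 * by page_follow_link_light), and shorter targets directly in
 * oi->i_data ("fast" symlinks, resolved in-core by exofs_follow_link).
 */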
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
590 590
591 if (depth == 0) 591 if (depth == 0)
592 return (err); 592 return (err);
593reread:
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
595 593
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
596 /* Simplest case - block found, no allocation needed */ 595 /* Simplest case - block found, no allocation needed */
597 if (!partial) { 596 if (!partial) {
598 first_block = le32_to_cpu(chain[depth - 1].key); 597 first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
602 while (count < maxblocks && count <= blocks_to_boundary) { 601 while (count < maxblocks && count <= blocks_to_boundary) {
603 ext2_fsblk_t blk; 602 ext2_fsblk_t blk;
604 603
605 if (!verify_chain(chain, partial)) { 604 if (!verify_chain(chain, chain + depth - 1)) {
606 /* 605 /*
607 * Indirect block might be removed by 606 * Indirect block might be removed by
608 * truncate while we were reading it. 607 * truncate while we were reading it.
609 * Handling of that case: forget what we've 608 * Handling of that case: forget what we've
610 * got now, go to reread. 609 * got now, go to reread.
611 */ 610 */
611 err = -EAGAIN;
612 count = 0; 612 count = 0;
613 goto changed; 613 break;
614 } 614 }
615 blk = le32_to_cpu(*(chain[depth-1].p + count)); 615 blk = le32_to_cpu(*(chain[depth-1].p + count));
616 if (blk == first_block + count) 616 if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
 			else
 				break;
 		}
-		goto got_it;
+		if (err != -EAGAIN)
+			goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
 		goto cleanup;
 
 	mutex_lock(&ei->truncate_mutex);
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext3_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
 
 	/*
 	 * Okay, we need to do block allocation.  Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
 		partial--;
 	}
 	return err;
-changed:
-	while (partial > chain) {
-		brelse(partial->bh);
-		partial--;
-	}
-	goto reread;
 }
 
 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
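
Taken together, the ext2_get_blocks() hunks above replace a goto-driven reread loop with a flag-and-revalidate scheme. Condensed, and hedged — error paths and the allocation step are elided, so this is a reading aid rather than a drop-in function body — the new control flow is:

	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
	if (!partial && err != -EAGAIN)
		goto got_it;			/* lockless fast path */

	mutex_lock(&ei->truncate_mutex);
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		/* drop the stale partial chain and re-read it; truncate
		 * and other get_block callers are now excluded */
		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
		if (!partial)
			goto got_it;		/* a racing writer mapped it */
	}
	/* ...allocate the missing blocks while holding truncate_mutex... */

The design point: instead of retrying an unlocked read indefinitely, one failed optimistic pass escalates to a single re-read under the mutex, where the chain can no longer change.
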
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f983225266dc..5c4afe652245 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
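
The ext2_quota_write() fix above closes a lock leak: the early "nothing was written" return path left i_mutex held. The bug class, reduced to its essentials (a hypothetical demo function, not kernel code):

ssize_t demo_write(struct inode *inode, size_t len, size_t towrite, int err)
{
	mutex_lock(&inode->i_mutex);
	/* ... attempt the copy; towrite counts what is still unwritten ... */
	if (len == towrite) {			/* nothing was written */
		mutex_unlock(&inode->i_mutex);	/* this unlock was missing */
		return err;
	}
	/* ... update i_size and i_version ... */
	mutex_unlock(&inode->i_mutex);
	return len;
}
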
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
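
The new EXT3_STATE_FLUSH_ON_CLOSE handling in ext3_release_file() pairs with the ext3_truncate() hunk further down this diff, which sets the flag when a file is truncated to zero in data=writeback mode. Pulled out of the hunks for readability (set side first, consume side second; both lines appear verbatim in this patch):

	/* set side, in ext3_truncate(): */
	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;

	/* consume side, in ext3_release_file(): */
	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
		filemap_flush(inode->i_mapping);
		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
	}

This narrows the window in which a crash leaves a truncated-and-rewritten file empty, flushing the new contents when the last writer closes the file.
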
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff169870..fcfa24361856 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
-}
-
 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
 	return !buffer_mapped(bh);
 }
+
 /*
  * Note that we always start a transaction even if we're not journalling
  * data.  This is to preserve ordering: any hole instantiation within
@@ -1512,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page,
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
-		return block_write_full_page(page, NULL, wbc);
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
 	}
-	page_bufs = page_buffers(page);
-
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1572,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -2354,6 +2376,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
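
All three write_end variants in the ext3/inode.c hunks above converge on the same crash-safety idiom: if the copy came up short, blocks may have been instantiated beyond i_size, so the inode is put on the orphan list while the transaction is still open, and the excess is trimmed only after the journal handle is closed. The recurring skeleton, lifted straight from the hunks:

	if (pos + len > inode->i_size)
		ext3_orphan_add(handle, inode);	/* journalled before commit */
	ret = ext3_journal_stop(handle);
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		vmtruncate(inode, inode->i_size);	/* trim blocks past EOF */

A crash between the two steps is then recoverable: orphan processing at the next mount finishes the truncate.
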
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
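
For reference, the conversion above changes the ioctl entry point's contract. The old ->ioctl hook ran under the Big Kernel Lock and received the inode explicitly; ->unlocked_ioctl does not, so the handler derives the inode itself and takes whatever locks it needs. The prototypes as used by these hunks:

/* before: called with the BKL held */
int ext3_ioctl(struct inode *inode, struct file *filp,
	       unsigned int cmd, unsigned long arg);

/* after: no BKL; the inode comes from the file */
long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);

This is why the SETFLAGS path above gains explicit mutex_lock(&inode->i_mutex) discipline with a single unlock at flags_out: the BKL no longer papers over the ordering.
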
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8b..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2265,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
@@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		ext3_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext3_orphan_add(handle, new_inode);
+		if (ext3_should_writeback_data(new_inode))
+			flush_file = 1;
 	}
 	retval = 0;
 
2406 2417
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
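
The ext3_lookup() hunk above deserves a note: ext3_iget() reports an on-disk inode with a zero link count as -ESTALE, which is a meaningful answer for NFS file-handle lookups, but when the inode was reached through a live directory entry it indicates directory corruption. The change therefore logs the event and converts the error; in essence:

	inode = ext3_iget(dir->i_sb, ino);
	if (IS_ERR(inode) && PTR_ERR(inode) == -ESTALE) {
		ext3_error(dir->i_sb, __func__,
			   "deleted inode referenced: %lu", ino);
		inode = ERR_PTR(-EIO);		/* corruption, not staleness */
	}
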
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899c..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-					ext4_group_t block_group)
+					ext4_group_t block_group,
+					struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
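
The free_blocks update above is representative of a wider conversion in this series: the per-flexgroup counters stop being __u32 fields guarded by the block-group spinlock and become atomic_t (see the struct flex_groups hunk in ext4.h further down), turning each read-modify-write into a single lock-free operation:

	/* before: three operations under a shared spinlock */
	spin_lock(sb_bgl_lock(sbi, flex_group));
	sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
	spin_unlock(sb_bgl_lock(sbi, flex_group));

	/* after: one atomic add, no lock traffic */
	atomic_add(blocks_freed, &sbi->s_flex_groups[flex_group].free_blocks);
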
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			  unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+							 sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
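
Every rec_len call site above gains a blocksize argument because rec_len is a 16-bit on-disk field: a single directory record spanning one full 64KiB block cannot be stored literally, so it is encoded as the EXT4_MAX_REC_LEN sentinel, and decoding therefore has to know the block size. The real helpers move out of line into namei.c (outside this diff); a plausible minimal decode, shown for orientation only, looks like:

static inline unsigned demo_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
	unsigned len = le16_to_cpu(dlen);

	if (len == EXT4_MAX_REC_LEN || len == 0)
		return blocksize;	/* sentinel: record fills the block */
	return len;
}
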
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057b..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
 #undef	EXT4FS_DEBUG
 
 /*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS	8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS		1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
  * Debug code
  */
 #ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR	1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	__u32 free_inodes;
-	__u32 free_blocks;
+	atomic_t free_inodes;
+	atomic_t free_blocks;
+	atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE	0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & EXT4_REG_FLMASK;
+	else
+		return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -256,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
 #define EXT4_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION		0x10000	/* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC	0x10000	/* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
 	__u8	s_reserved_char_pad2;
 	__le16  s_reserved_pad;
-	__u32	s_reserved[162];        /* Padding to the end of the block */
+	__le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
+	__u32	s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
 
 /*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME	4
+
+/*
  * Structure of a directory entry
  */
 #define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
 					 ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN		((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-	unsigned len = le16_to_cpu(dlen);
-
-	if (len == EXT4_MAX_REC_LEN || len == 0)
-		return 1 << 16;
-	return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-	if (len == (1 << 16))
-		return cpu_to_le16(EXT4_MAX_REC_LEN);
-	else if (len > (1 << 16))
-		BUG();
-	return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define EXT4_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_ui_proc_fops, &sbi->s_##var);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
@@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
 
 /* ioctl.c */
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
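
A usage sketch for the new ext4_mask_flags() helper introduced in the ext4.h hunks above. The call site below is hypothetical (the real users are the ioctl and inode-allocation paths, which are outside this diff): flags inherited from a parent directory are first narrowed to EXT4_FL_INHERITED, then filtered by the new inode's type so that, for example, DIRSYNC never lands on a regular file:

	/* dir_ei, new_ei and mode are illustrative names */
	__u32 candidate = dir_ei->i_flags & EXT4_FL_INHERITED;

	new_ei->i_flags = ext4_mask_flags(mode, candidate);
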
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
 	/* allocation reservation info for delalloc */
 	unsigned int i_reserved_data_blocks;
 	unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock s_blockgroup_lock;
+	struct blockgroup_lock *s_blockgroup_lock;
 	struct proc_dir_entry *s_proc;
-
-	/* root of the per fs reservation window tree */
-	spinlock_t s_rsv_window_lock;
-	struct rb_root s_rsv_window_root;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
 
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f596..e3a55eb8b26a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
 		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,70 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 	return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t block = ext_pblock(ext), valid_block;
+	int len = ext4_ext_get_actual_len(ext);
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     ((block + len) > ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+				 struct ext4_extent_idx *ext_idx)
+{
+	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+	valid_block = le32_to_cpu(es->s_first_data_block) +
+		EXT4_SB(inode->i_sb)->s_gdb_count;
+	if (unlikely(block <= valid_block ||
+		     (block >= ext4_blocks_count(es))))
+		return 0;
+	else
+		return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+				     struct ext4_extent_header *eh,
+				     int depth)
+{
+	struct ext4_extent *ext;
+	struct ext4_extent_idx *ext_idx;
+	unsigned short entries;
+	if (eh->eh_entries == 0)
+		return 1;
+
+	entries = le16_to_cpu(eh->eh_entries);
+
+	if (depth == 0) {
+		/* leaf entries */
+		ext = EXT_FIRST_EXTENT(eh);
+		while (entries) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+			ext++;
+			entries--;
+		}
+	} else {
+		ext_idx = EXT_FIRST_INDEX(eh);
+		while (entries) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+			ext_idx++;
+			entries--;
+		}
+	}
+	return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
 					struct ext4_extent_header *eh,
 					int depth)
 {
@@ -329,11 +415,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
 		error_msg = "invalid eh_entries";
 		goto corrupted;
 	}
+	if (!ext4_valid_extent_entries(inode, eh, depth)) {
+		error_msg = "invalid extent entries";
+		goto corrupted;
+	}
 	return 0;
 
 corrupted:
 	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
+			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +432,13 @@ corrupted:
 	return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)	\
-	__ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)	\
+	__ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +642,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
547 642
548 eh = ext_inode_hdr(inode); 643 eh = ext_inode_hdr(inode);
549 depth = ext_depth(inode); 644 depth = ext_depth(inode);
550 if (ext4_ext_check_header(inode, eh, depth))
551 return ERR_PTR(-EIO);
552
553 645
554 /* account possible depth increase */ 646 /* account possible depth increase */
555 if (!path) { 647 if (!path) {
@@ -565,6 +657,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
565 i = depth; 657 i = depth;
566 /* walk through the tree */ 658 /* walk through the tree */
567 while (i) { 659 while (i) {
660 int need_to_validate = 0;
661
568 ext_debug("depth %d: num %d, max %d\n", 662 ext_debug("depth %d: num %d, max %d\n",
569 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 663 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
570 664
@@ -573,10 +667,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
573 path[ppos].p_depth = i; 667 path[ppos].p_depth = i;
574 path[ppos].p_ext = NULL; 668 path[ppos].p_ext = NULL;
575 669
576 bh = sb_bread(inode->i_sb, path[ppos].p_block); 670 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
577 if (!bh) 671 if (unlikely(!bh))
578 goto err; 672 goto err;
579 673 if (!bh_uptodate_or_lock(bh)) {
674 if (bh_submit_read(bh) < 0) {
675 put_bh(bh);
676 goto err;
677 }
678 /* validate the extent entries */
679 need_to_validate = 1;
680 }
580 eh = ext_block_hdr(bh); 681 eh = ext_block_hdr(bh);
581 ppos++; 682 ppos++;
582 BUG_ON(ppos > depth); 683 BUG_ON(ppos > depth);
@@ -584,7 +685,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
584 path[ppos].p_hdr = eh; 685 path[ppos].p_hdr = eh;
585 i--; 686 i--;
586 687
587 if (ext4_ext_check_header(inode, eh, i)) 688 if (need_to_validate && ext4_ext_check(inode, eh, i))
588 goto err; 689 goto err;
589 } 690 }
590 691
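
Replacing sb_bread() with sb_getblk() plus bh_uptodate_or_lock()/bh_submit_read() lets the walker see whether a buffer actually came from disk, so the relatively costly ext4_ext_check() runs only on freshly read blocks rather than on every descent through already-cached ones. The pattern, sketched with the standard buffer-head calls (sb and blocknr stand for the superblock and the target block):

	struct buffer_head *bh;
	int need_to_validate = 0;

	bh = sb_getblk(sb, blocknr);		/* get buffer, no I/O yet */
	if (unlikely(!bh))
		goto err;
	if (!bh_uptodate_or_lock(bh)) {		/* 0 => not uptodate, now locked */
		if (bh_submit_read(bh) < 0) {	/* issue the read and wait */
			put_bh(bh);
			goto err;
		}
		need_to_validate = 1;		/* fresh from disk: check it */
	}
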
@@ -1181,7 +1282,7 @@ got_index:
1181 return -EIO; 1282 return -EIO;
1182 eh = ext_block_hdr(bh); 1283 eh = ext_block_hdr(bh);
1183 /* subtract from p_depth to get proper eh_depth */ 1284 /* subtract from p_depth to get proper eh_depth */
1184 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1285 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1185 put_bh(bh); 1286 put_bh(bh);
1186 return -EIO; 1287 return -EIO;
1187 } 1288 }
@@ -1194,7 +1295,7 @@ got_index:
1194 if (bh == NULL) 1295 if (bh == NULL)
1195 return -EIO; 1296 return -EIO;
1196 eh = ext_block_hdr(bh); 1297 eh = ext_block_hdr(bh);
1197 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1298 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1198 put_bh(bh); 1299 put_bh(bh);
1199 return -EIO; 1300 return -EIO;
1200 } 1301 }
@@ -1740,11 +1841,13 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1740{ 1841{
1741 struct ext4_ext_cache *cex; 1842 struct ext4_ext_cache *cex;
1742 BUG_ON(len == 0); 1843 BUG_ON(len == 0);
1844 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1743 cex = &EXT4_I(inode)->i_cached_extent; 1845 cex = &EXT4_I(inode)->i_cached_extent;
1744 cex->ec_type = type; 1846 cex->ec_type = type;
1745 cex->ec_block = block; 1847 cex->ec_block = block;
1746 cex->ec_len = len; 1848 cex->ec_len = len;
1747 cex->ec_start = start; 1849 cex->ec_start = start;
1850 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1748} 1851}
1749 1852
1750/* 1853/*
@@ -1801,12 +1904,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1801 struct ext4_extent *ex) 1904 struct ext4_extent *ex)
1802{ 1905{
1803 struct ext4_ext_cache *cex; 1906 struct ext4_ext_cache *cex;
1907 int ret = EXT4_EXT_CACHE_NO;
1804 1908
1909 /*
1910 * We borrow i_block_reservation_lock to protect i_cached_extent
1911 */
1912 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1805 cex = &EXT4_I(inode)->i_cached_extent; 1913 cex = &EXT4_I(inode)->i_cached_extent;
1806 1914
1807 /* has cache valid data? */ 1915 /* has cache valid data? */
1808 if (cex->ec_type == EXT4_EXT_CACHE_NO) 1916 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1809 return EXT4_EXT_CACHE_NO; 1917 goto errout;
1810 1918
1811 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 1919 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1812 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 1920 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1817,11 +1925,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1817 ext_debug("%u cached by %u:%u:%llu\n", 1925 ext_debug("%u cached by %u:%u:%llu\n",
1818 block, 1926 block,
1819 cex->ec_block, cex->ec_len, cex->ec_start); 1927 cex->ec_block, cex->ec_len, cex->ec_start);
1820 return cex->ec_type; 1928 ret = cex->ec_type;
1821 } 1929 }
1822 1930errout:
1823 /* not in cache */ 1931 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1824 return EXT4_EXT_CACHE_NO; 1932 return ret;
1825} 1933}
1826 1934
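
i_cached_extent is a single slot shared by all users of the inode, so an unlocked reader could see a mix of two different cached extents. Rather than grow the inode with a new lock, the patch takes the existing i_block_reservation_lock around both the store and the lookup; in outline:

	struct ext4_ext_cache *cex;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	/* read or update all cex->ec_* fields while the lock is held;
	 * readers copy out what they need before unlocking */
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
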
1827/* 1935/*
@@ -2137,7 +2245,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2137 return -ENOMEM; 2245 return -ENOMEM;
2138 } 2246 }
2139 path[0].p_hdr = ext_inode_hdr(inode); 2247 path[0].p_hdr = ext_inode_hdr(inode);
2140 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { 2248 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2141 err = -EIO; 2249 err = -EIO;
2142 goto out; 2250 goto out;
2143 } 2251 }
@@ -2191,7 +2299,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2191 err = -EIO; 2299 err = -EIO;
2192 break; 2300 break;
2193 } 2301 }
2194 if (ext4_ext_check_header(inode, ext_block_hdr(bh), 2302 if (ext4_ext_check(inode, ext_block_hdr(bh),
2195 depth - i - 1)) { 2303 depth - i - 1)) {
2196 err = -EIO; 2304 err = -EIO;
2197 break; 2305 break;
@@ -2321,8 +2429,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2321 len = ee_len; 2429 len = ee_len;
2322 2430
2323 bio = bio_alloc(GFP_NOIO, len); 2431 bio = bio_alloc(GFP_NOIO, len);
2324 if (!bio)
2325 return -ENOMEM;
2326 bio->bi_sector = ee_pblock; 2432 bio->bi_sector = ee_pblock;
2327 bio->bi_bdev = inode->i_sb->s_bdev; 2433 bio->bi_bdev = inode->i_sb->s_bdev;
2328 2434
@@ -2776,6 +2882,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2776 if (allocated > max_blocks) 2882 if (allocated > max_blocks)
2777 allocated = max_blocks; 2883 allocated = max_blocks;
2778 set_buffer_unwritten(bh_result); 2884 set_buffer_unwritten(bh_result);
2885 bh_result->b_bdev = inode->i_sb->s_bdev;
2886 bh_result->b_blocknr = newblock;
2779 goto out2; 2887 goto out2;
2780 } 2888 }
2781 2889
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
33 */ 33 */
34static int ext4_release_file(struct inode *inode, struct file *filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
37 ext4_alloc_da_blocks(inode);
38 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1) &&
43 !EXT4_I(inode)->i_reserved_data_blocks)
39 { 44 {
40 down_write(&EXT4_I(inode)->i_data_sem); 45 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_preallocations(inode); 46 ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8f..f18e0a08a6b5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count, cleared; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 if (is_directory) { 267 if (is_directory) {
269 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
270 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
275 }
276
271 } 277 }
272 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
273 block_group, gdp); 279 block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
278 284
279 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
280 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
281 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
282 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
283 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 } 290 }
285 } 291 }
286 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
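
The conversion pattern throughout this hunk: each flex-group counter becomes an atomic_t, so the old critical section, which only ever touched one counter, collapses to a single atomic operation. Readers use atomic_read() and tolerate a slightly stale value. Side by side:

	/* before: plain counter under an external lock */
	spin_lock(sb_bgl_lock(sbi, flex_group));
	sbi->s_flex_groups[flex_group].free_inodes++;
	spin_unlock(sb_bgl_lock(sbi, flex_group));

	/* after: lockless read-modify-write on an atomic_t */
	atomic_inc(&sbi->s_flex_groups[flex_group].free_inodes);
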
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
360 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
361 367
362find_close_to_parent: 368find_close_to_parent:
363 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
364 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
365 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
366 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
367 goto found_flexbg; 373 goto found_flexbg;
368 374
@@ -375,24 +381,24 @@ find_close_to_parent:
375 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
376 continue; 382 continue;
377 383
378 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
379 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
380 386
381 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
382 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
383 best_flex = i; 389 best_flex = i;
384 goto found_flexbg; 390 goto found_flexbg;
385 } 391 }
386 392
387 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
388 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
389 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
390 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
391 best_flex = i; 397 best_flex = i;
392 } 398 }
393 399
394 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
395 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
396 return -1; 402 return -1;
397 403
398found_flexbg: 404found_flexbg:
@@ -410,6 +416,42 @@ out:
410 return 0; 416 return 0;
411} 417}
412 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
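
A typical caller (as in the loops below) treats g as a block-group number when flex_size is 1 and as a flex-group number otherwise, then filters candidates on the returned counts; roughly:

	struct orlov_stats stats;
	ext4_group_t i, g;

	for (i = 0; i < ngroups; i++) {
		g = (parent_group + i) % ngroups;
		get_orlov_stats(sb, g, flex_size, &stats);
		if (!stats.free_inodes)
			continue;	/* nothing allocatable here */
		/* otherwise weigh stats.free_blocks and stats.used_dirs
		 * against the thresholds computed by the caller */
	}
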
413/* 455/*
414 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
415 * 457 *
@@ -425,35 +467,34 @@ out:
425 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 474 */
437 475
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 477 ext4_group_t *group, int mode)
443{ 478{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 485 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 488 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
457 498
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -462,71 +503,98 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 505
465 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 510 int ret = -1;
470 511
471 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
477 continue; 518 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
479 continue; 520 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
481 continue; 522 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
483 continue; 524 continue;
484 *group = grp; 525 grp = g;
485 ret = 0; 526 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start at the 2nd block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
487 } 553 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 554 goto fallback;
491 } 555 }
492 556
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
499 560 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
502 if (max_debt * INODE_COST > inodes_per_group) 563 /*
503 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 565 * inode for this parent directory
505 max_debt = 255; 566 */
506 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
508 572
509 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
513 continue; 577 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs) 578 if (stats.free_inodes < min_inodes)
515 continue; 579 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 580 if (stats.free_blocks < min_blocks)
517 continue; 581 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 582 goto found_flex_bg;
519 continue;
520 return 0;
521 } 583 }
522 584
523fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588fallback_retry:
589 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 590 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 591 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 593 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 594 ext4_free_inodes_count(sb, desc) >= avefreei) {
595 *group = grp;
529 return 0; 596 return 0;
597 }
530 } 598 }
531 599
532 if (avefreei) { 600 if (avefreei) {
@@ -535,19 +603,58 @@ fallback:
535 * filesystems the above test can fail to find any blockgroups 603 * filesystems the above test can fail to find any blockgroups
536 */ 604 */
537 avefreei = 0; 605 avefreei = 0;
538 goto fallback; 606 goto fallback_retry;
539 } 607 }
540 608
541 return -1; 609 return -1;
542} 610}
543 611
544static int find_group_other(struct super_block *sb, struct inode *parent, 612static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 613 ext4_group_t *group, int mode)
546{ 614{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 615 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 616 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 617 struct ext4_group_desc *desc;
550 ext4_group_t i; 618 ext4_group_t i, last;
619 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
620
621 /*
622 * Try to place the inode is the same flex group as its
623 * parent. If we can't find space, use the Orlov algorithm to
624 * find another flex group, and store that information in the
625 * parent directory's inode information so that use that flex
626 * group for future allocations.
627 */
628 if (flex_size > 1) {
629 int retry = 0;
630
631 try_again:
632 parent_group &= ~(flex_size-1);
633 last = parent_group + flex_size;
634 if (last > ngroups)
635 last = ngroups;
636 for (i = parent_group; i < last; i++) {
637 desc = ext4_get_group_desc(sb, i, NULL);
638 if (desc && ext4_free_inodes_count(sb, desc)) {
639 *group = i;
640 return 0;
641 }
642 }
643 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
644 retry = 1;
645 parent_group = EXT4_I(parent)->i_last_alloc_group;
646 goto try_again;
647 }
648 /*
649 * If this didn't work, use the Orlov search algorithm
650 * to find a new flex group; we pass in the mode to
651 * avoid the topdir algorithms.
652 */
653 *group = parent_group + flex_size;
654 if (*group > ngroups)
655 *group = 0;
656 return find_group_orlov(sb, parent, group, mode);
657 }
551 658
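
Since flex_size is a power of two, clearing the low bits of a group number gives the first block group of its flex group, and the next flex_size groups form the search window. A worked example, assuming flex_size = 16 (so s_log_groups_per_flex = 4):

	unsigned int flex_size = 16;	/* block groups per flex group */
	unsigned int group = 37;

	unsigned int first = group & ~(flex_size - 1);	/* 37 & ~15 = 32 */
	unsigned int last = first + flex_size;		/* 48, exclusive */
	unsigned int flexno = group >> 4;		/* flex group 2 */
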
552 /* 659 /*
553 * Try to place the inode in its parent directory 660 * Try to place the inode in its parent directory
@@ -665,6 +772,11 @@ static int ext4_claim_inode(struct super_block *sb,
665 if (S_ISDIR(mode)) { 772 if (S_ISDIR(mode)) {
666 count = ext4_used_dirs_count(sb, gdp) + 1; 773 count = ext4_used_dirs_count(sb, gdp) + 1;
667 ext4_used_dirs_set(sb, gdp, count); 774 ext4_used_dirs_set(sb, gdp, count);
775 if (sbi->s_log_groups_per_flex) {
776 ext4_group_t f = ext4_flex_group(sbi, group);
777
778 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
779 }
668 } 780 }
669 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 781 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
670err_ret: 782err_ret:
@@ -716,15 +828,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 828 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 829 es = sbi->s_es;
718 830
719 if (sbi->s_log_groups_per_flex) { 831 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 832 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 833 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 834 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 835 if (ret2 == 0 && once) {
724 once = 0; 836 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 837 printk(KERN_NOTICE "ext4: find_group_flex "
726 "failed, fallback succeeded dir %lu\n", 838 "failed, fallback succeeded dir %lu\n",
727 dir->i_ino); 839 dir->i_ino);
840 }
728 } 841 }
729 goto got_group; 842 goto got_group;
730 } 843 }
@@ -733,11 +846,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 846 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 847 ret2 = find_group_dir(sb, dir, &group);
735 else 848 else
736 ret2 = find_group_orlov(sb, dir, &group); 849 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 850 } else
738 ret2 = find_group_other(sb, dir, &group); 851 ret2 = find_group_other(sb, dir, &group, mode);
739 852
740got_group: 853got_group:
854 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 855 err = -ENOSPC;
742 if (ret2 == -1) 856 if (ret2 == -1)
743 goto out; 857 goto out;
@@ -858,9 +972,7 @@ got:
858 972
859 if (sbi->s_log_groups_per_flex) { 973 if (sbi->s_log_groups_per_flex) {
860 flex_group = ext4_flex_group(sbi, group); 974 flex_group = ext4_flex_group(sbi, group);
861 spin_lock(sb_bgl_lock(sbi, flex_group)); 975 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
862 sbi->s_flex_groups[flex_group].free_inodes--;
863 spin_unlock(sb_bgl_lock(sbi, flex_group));
864 } 976 }
865 977
866 inode->i_uid = current_fsuid(); 978 inode->i_uid = current_fsuid();
@@ -885,19 +997,16 @@ got:
885 ei->i_disksize = 0; 997 ei->i_disksize = 0;
886 998
887 /* 999 /*
888 * Don't inherit extent flag from directory. We set extent flag on 1000 * Don't inherit extent flag from directory, amongst others. We set
889 * newly created directory and file only if -o extent mount option is 1001 * extent flag on newly created directory and file only if -o extent
890 * specified 1002 * mount option is specified
891 */ 1003 */
892 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1004 ei->i_flags =
893 if (S_ISLNK(mode)) 1005 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
894 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
895 /* dirsync only applies to directories */
896 if (!S_ISDIR(mode))
897 ei->i_flags &= ~EXT4_DIRSYNC_FL;
898 ei->i_file_acl = 0; 1006 ei->i_file_acl = 0;
899 ei->i_dtime = 0; 1007 ei->i_dtime = 0;
900 ei->i_block_group = group; 1008 ei->i_block_group = group;
1009 ei->i_last_alloc_group = ~0;
901 1010
902 ext4_set_inode_flags(inode); 1011 ext4_set_inode_flags(inode);
903 if (IS_DIRSYNC(inode)) 1012 if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db79..2a9ffd528dd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
371 return n; 371 return n;
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) {
376
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p;
379 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
381 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) "
383 "in inode #%lu, offset=%d",
384 le32_to_cpu(*bref), maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO;
387 }
388 bref++;
389 }
390 return 0;
391}
392
393
394#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397
398#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS)
401
374/** 402/**
375 * ext4_get_branch - read the chain of indirect blocks leading to data 403 * ext4_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question 404 * @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
415 if (!p->key) 443 if (!p->key)
416 goto no_block; 444 goto no_block;
417 while (--depth) { 445 while (--depth) {
418 bh = sb_bread(sb, le32_to_cpu(p->key)); 446 bh = sb_getblk(sb, le32_to_cpu(p->key));
419 if (!bh) 447 if (unlikely(!bh))
420 goto failure; 448 goto failure;
449
450 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) {
452 put_bh(bh);
453 goto failure;
454 }
455 /* validate block references */
456 if (ext4_check_indirect_blockref(inode, bh)) {
457 put_bh(bh);
458 goto failure;
459 }
460 }
461
421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
422 /* Reader: end */ 463 /* Reader: end */
423 if (!p->key) 464 if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
459 ext4_fsblk_t bg_start; 500 ext4_fsblk_t bg_start;
460 ext4_fsblk_t last_block; 501 ext4_fsblk_t last_block;
461 ext4_grpblk_t colour; 502 ext4_grpblk_t colour;
503 ext4_group_t block_group;
504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 505
463 /* Try to find previous block */ 506 /* Try to find previous block */
464 for (p = ind->p - 1; p >= start; p--) { 507 for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
474 * It is going to be referred to from the inode itself? OK, just put it 517 * It is going to be referred to from the inode itself? OK, just put it
475 * into the same cylinder group then. 518 * into the same cylinder group then.
476 */ 519 */
477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 520 block_group = ei->i_block_group;
521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
522 block_group &= ~(flex_size-1);
523 if (S_ISREG(inode->i_mode))
524 block_group++;
525 }
526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 528
529 /*
530 * If we are doing delayed allocation, we don't need to take
531 * colour into account.
532 */
533 if (test_opt(inode->i_sb, DELALLOC))
534 return bg_start;
535
480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 colour = (current->pid % 16) * 537 colour = (current->pid % 16) *
482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1052 /* 1108 /*
1053 * free those over-booking quota for metadata blocks 1109 * free those over-booking quota for metadata blocks
1054 */ 1110 */
1055
1056 if (mdb_free) 1111 if (mdb_free)
1057 vfs_dq_release_reservation_block(inode, mdb_free); 1112 vfs_dq_release_reservation_block(inode, mdb_free);
1113
1114 /*
1115 * If we have done all the pending block allocations and if
1116 * there aren't any writers on the inode, we can discard the
1117 * inode's preallocations.
1118 */
1119 if (!total && (atomic_read(&inode->i_writecount) == 0))
1120 ext4_discard_preallocations(inode);
1058} 1121}
1059 1122
1060/* 1123/*
@@ -1086,6 +1149,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1086 int retval; 1149 int retval;
1087 1150
1088 clear_buffer_mapped(bh); 1151 clear_buffer_mapped(bh);
1152 clear_buffer_unwritten(bh);
1089 1153
1090 /* 1154 /*
1091 * Try to see if we can get the block without requesting 1155 * Try to see if we can get the block without requesting
@@ -1116,6 +1180,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1116 return retval; 1180 return retval;
1117 1181
1118 /* 1182 /*
1183 * When we call get_blocks without the create flag, the
1184 * BH_Unwritten flag could have gotten set if the blocks
1185 * requested were part of an uninitialized extent. We need to
1186 * clear this flag now that we are committed to convert all or
1187 * part of the uninitialized extent to be an initialized
1188 * extent. This is because we need to avoid the combination
1189 * of BH_Unwritten and BH_Mapped flags being simultaneously
1190 * set on the buffer_head.
1191 */
1192 clear_buffer_unwritten(bh);
1193
1194 /*
1119 * New blocks allocate and/or writing to uninitialized extent 1195 * New blocks allocate and/or writing to uninitialized extent
1120 * will possibly result in updating i_data, so we take 1196 * will possibly result in updating i_data, so we take
1121 * the write lock of i_data_sem, and call get_blocks() 1197 * the write lock of i_data_sem, and call get_blocks()
@@ -1688,9 +1764,10 @@ static void ext4_da_page_release_reservation(struct page *page,
1688 1764
1689struct mpage_da_data { 1765struct mpage_da_data {
1690 struct inode *inode; 1766 struct inode *inode;
1691 struct buffer_head lbh; /* extent of blocks */ 1767 sector_t b_blocknr; /* start block number of extent */
1768 size_t b_size; /* size of extent */
1769 unsigned long b_state; /* state of the extent */
1692 unsigned long first_page, next_page; /* extent of pages */ 1770 unsigned long first_page, next_page; /* extent of pages */
1693 get_block_t *get_block;
1694 struct writeback_control *wbc; 1771 struct writeback_control *wbc;
1695 int io_done; 1772 int io_done;
1696 int pages_written; 1773 int pages_written;
@@ -1704,7 +1781,6 @@ struct mpage_da_data {
1704 * @mpd->inode: inode 1781 * @mpd->inode: inode
1705 * @mpd->first_page: first page of the extent 1782 * @mpd->first_page: first page of the extent
1706 * @mpd->next_page: page after the last page of the extent 1783 * @mpd->next_page: page after the last page of the extent
1707 * @mpd->get_block: the filesystem's block mapper function
1708 * 1784 *
1709 * By the time mpage_da_submit_io() is called we expect all blocks 1785 * By the time mpage_da_submit_io() is called we expect all blocks
1710 * to be allocated. this may be wrong if allocation failed. 1786 * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1800,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1724 /* 1800 /*
1725 * We need to start from the first_page to the next_page - 1 1801 * We need to start from the first_page to the next_page - 1
1726 * to make sure we also write the mapped dirty buffer_heads. 1802 * to make sure we also write the mapped dirty buffer_heads.
1727 * If we look at mpd->lbh.b_blocknr we would only be looking 1803 * If we look at mpd->b_blocknr we would only be looking
1728 * at the currently mapped buffer_heads. 1804 * at the currently mapped buffer_heads.
1729 */ 1805 */
1730 index = mpd->first_page; 1806 index = mpd->first_page;
@@ -1914,68 +1990,111 @@ static void ext4_print_free_blocks(struct inode *inode)
1914 return; 1990 return;
1915} 1991}
1916 1992
1993#define EXT4_DELALLOC_RSVED 1
1994static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1995 struct buffer_head *bh_result, int create)
1996{
1997 int ret;
1998 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1999 loff_t disksize = EXT4_I(inode)->i_disksize;
2000 handle_t *handle = NULL;
2001
2002 handle = ext4_journal_current_handle();
2003 BUG_ON(!handle);
2004 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2005 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2006 if (ret <= 0)
2007 return ret;
2008
2009 bh_result->b_size = (ret << inode->i_blkbits);
2010
2011 if (ext4_should_order_data(inode)) {
2012 int retval;
2013 retval = ext4_jbd2_file_inode(handle, inode);
2014 if (retval)
2015 /*
2016 * Failed to add inode for ordered mode. Don't
2017 * update file size
2018 */
2019 return retval;
2020 }
2021
2022 /*
2023 * Update on-disk size along with block allocation; we don't
2024 * use 'extend_disksize' as size may change within already
2025 * allocated block -bzzz
2026 */
2027 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2028 if (disksize > i_size_read(inode))
2029 disksize = i_size_read(inode);
2030 if (disksize > EXT4_I(inode)->i_disksize) {
2031 ext4_update_i_disksize(inode, disksize);
2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 return ret;
2034 }
2035 return 0;
2036}
2037
1917/* 2038/*
1918 * mpage_da_map_blocks - go through given space 2039 * mpage_da_map_blocks - go through given space
1919 * 2040 *
1920 * @mpd->lbh - bh describing space 2041 * @mpd - bh describing space
1921 * @mpd->get_block - the filesystem's block mapper function
1922 * 2042 *
1923 * The function skips space we know is already mapped to disk blocks. 2043 * The function skips space we know is already mapped to disk blocks.
1924 * 2044 *
1925 */ 2045 */
1926static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2046static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1927{ 2047{
1928 int err = 0; 2048 int err = 0;
1929 struct buffer_head new; 2049 struct buffer_head new;
1930 struct buffer_head *lbh = &mpd->lbh;
1931 sector_t next; 2050 sector_t next;
1932 2051
1933 /* 2052 /*
1934 * We consider only non-mapped and non-allocated blocks 2053 * We consider only non-mapped and non-allocated blocks
1935 */ 2054 */
1936 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 2055 if ((mpd->b_state & (1 << BH_Mapped)) &&
2056 !(mpd->b_state & (1 << BH_Delay)))
1937 return 0; 2057 return 0;
1938 new.b_state = lbh->b_state; 2058 new.b_state = mpd->b_state;
1939 new.b_blocknr = 0; 2059 new.b_blocknr = 0;
1940 new.b_size = lbh->b_size; 2060 new.b_size = mpd->b_size;
1941 next = lbh->b_blocknr; 2061 next = mpd->b_blocknr;
1942 /* 2062 /*
1943 * If we didn't accumulate anything 2063 * If we didn't accumulate anything
1944 * to write simply return 2064 * to write simply return
1945 */ 2065 */
1946 if (!new.b_size) 2066 if (!new.b_size)
1947 return 0; 2067 return 0;
1948 err = mpd->get_block(mpd->inode, next, &new, 1);
1949 if (err) {
1950 2068
1951 /* If get block returns with error 2069 err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
1952 * we simply return. Later writepage 2070 if (err) {
1953 * will redirty the page and writepages 2071 /*
1954 * will find the dirty page again 2072 * If get block returns with error we simply
2073 * return. Later writepage will redirty the page and
2074 * writepages will find the dirty page again
1955 */ 2075 */
1956 if (err == -EAGAIN) 2076 if (err == -EAGAIN)
1957 return 0; 2077 return 0;
1958 2078
1959 if (err == -ENOSPC && 2079 if (err == -ENOSPC &&
1960 ext4_count_free_blocks(mpd->inode->i_sb)) { 2080 ext4_count_free_blocks(mpd->inode->i_sb)) {
1961 mpd->retval = err; 2081 mpd->retval = err;
1962 return 0; 2082 return 0;
1963 } 2083 }
1964 2084
1965 /* 2085 /*
1966 * get block failure will cause us 2086 * get block failure will cause us to loop in
1967 * to loop in writepages. Because 2087 * writepages, because a_ops->writepage won't be able
1968 * a_ops->writepage won't be able to 2088 * to make progress. The page will be redirtied by
1969 * make progress. The page will be redirtied 2089 * writepage and writepages will again try to write
1970 * by writepage and writepages will again 2090 * the same.
1971 * try to write the same.
1972 */ 2091 */
1973 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2092 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1974 "at logical offset %llu with max blocks " 2093 "at logical offset %llu with max blocks "
1975 "%zd with error %d\n", 2094 "%zd with error %d\n",
1976 __func__, mpd->inode->i_ino, 2095 __func__, mpd->inode->i_ino,
1977 (unsigned long long)next, 2096 (unsigned long long)next,
1978 lbh->b_size >> mpd->inode->i_blkbits, err); 2097 mpd->b_size >> mpd->inode->i_blkbits, err);
1979 printk(KERN_EMERG "This should not happen.!! " 2098 printk(KERN_EMERG "This should not happen.!! "
1980 "Data will be lost\n"); 2099 "Data will be lost\n");
1981 if (err == -ENOSPC) { 2100 if (err == -ENOSPC) {
@@ -1983,7 +2102,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1983 } 2102 }
1984 /* invalidate all the pages */ 2103 /* invalidate all the pages */
1985 ext4_da_block_invalidatepages(mpd, next, 2104 ext4_da_block_invalidatepages(mpd, next,
1986 lbh->b_size >> mpd->inode->i_blkbits); 2105 mpd->b_size >> mpd->inode->i_blkbits);
1987 return err; 2106 return err;
1988 } 2107 }
1989 BUG_ON(new.b_size == 0); 2108 BUG_ON(new.b_size == 0);
@@ -1995,7 +2114,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1995 * If blocks are delayed marked, we need to 2114 * If blocks are delayed marked, we need to
1996 * put actual blocknr and drop delayed bit 2115 * put actual blocknr and drop delayed bit
1997 */ 2116 */
1998 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 2117 if ((mpd->b_state & (1 << BH_Delay)) ||
2118 (mpd->b_state & (1 << BH_Unwritten)))
1999 mpage_put_bnr_to_bhs(mpd, next, &new); 2119 mpage_put_bnr_to_bhs(mpd, next, &new);
2000 2120
2001 return 0; 2121 return 0;
@@ -2014,12 +2134,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2014 * the function is used to collect contig. blocks in same state 2134 * the function is used to collect contig. blocks in same state
2015 */ 2135 */
2016static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2136static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2017 sector_t logical, struct buffer_head *bh) 2137 sector_t logical, size_t b_size,
2138 unsigned long b_state)
2018{ 2139{
2019 sector_t next; 2140 sector_t next;
2020 size_t b_size = bh->b_size; 2141 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2021 struct buffer_head *lbh = &mpd->lbh;
2022 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
2023 2142
2024 /* check if the reserved journal credits might overflow */ 2143 /* check if the reserved journal credits might overflow */
2025 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2144 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2165,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2046 /* 2165 /*
2047 * First block in the extent 2166 * First block in the extent
2048 */ 2167 */
2049 if (lbh->b_size == 0) { 2168 if (mpd->b_size == 0) {
2050 lbh->b_blocknr = logical; 2169 mpd->b_blocknr = logical;
2051 lbh->b_size = b_size; 2170 mpd->b_size = b_size;
2052 lbh->b_state = bh->b_state & BH_FLAGS; 2171 mpd->b_state = b_state & BH_FLAGS;
2053 return; 2172 return;
2054 } 2173 }
2055 2174
2056 next = lbh->b_blocknr + nrblocks; 2175 next = mpd->b_blocknr + nrblocks;
2057 /* 2176 /*
2058 * Can we merge the block to our big extent? 2177 * Can we merge the block to our big extent?
2059 */ 2178 */
2060 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2179 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2061 lbh->b_size += b_size; 2180 mpd->b_size += b_size;
2062 return; 2181 return;
2063 } 2182 }
2064 2183
@@ -2087,7 +2206,7 @@ static int __mpage_da_writepage(struct page *page,
2087{ 2206{
2088 struct mpage_da_data *mpd = data; 2207 struct mpage_da_data *mpd = data;
2089 struct inode *inode = mpd->inode; 2208 struct inode *inode = mpd->inode;
2090 struct buffer_head *bh, *head, fake; 2209 struct buffer_head *bh, *head;
2091 sector_t logical; 2210 sector_t logical;
2092 2211
2093 if (mpd->io_done) { 2212 if (mpd->io_done) {
@@ -2129,9 +2248,9 @@ static int __mpage_da_writepage(struct page *page,
2129 /* 2248 /*
2130 * ... and blocks 2249 * ... and blocks
2131 */ 2250 */
2132 mpd->lbh.b_size = 0; 2251 mpd->b_size = 0;
2133 mpd->lbh.b_state = 0; 2252 mpd->b_state = 0;
2134 mpd->lbh.b_blocknr = 0; 2253 mpd->b_blocknr = 0;
2135 } 2254 }
2136 2255
2137 mpd->next_page = page->index + 1; 2256 mpd->next_page = page->index + 1;
@@ -2139,16 +2258,8 @@ static int __mpage_da_writepage(struct page *page,
2139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2258 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2140 2259
2141 if (!page_has_buffers(page)) { 2260 if (!page_has_buffers(page)) {
2142 /* 2261 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2143 * There is no attached buffer heads yet (mmap?) 2262 (1 << BH_Dirty) | (1 << BH_Uptodate));
2144 * we treat the page asfull of dirty blocks
2145 */
2146 bh = &fake;
2147 bh->b_size = PAGE_CACHE_SIZE;
2148 bh->b_state = 0;
2149 set_buffer_dirty(bh);
2150 set_buffer_uptodate(bh);
2151 mpage_add_bh_to_extent(mpd, logical, bh);
2152 if (mpd->io_done) 2263 if (mpd->io_done)
2153 return MPAGE_DA_EXTENT_TAIL; 2264 return MPAGE_DA_EXTENT_TAIL;
2154 } else { 2265 } else {
@@ -2166,8 +2277,10 @@ static int __mpage_da_writepage(struct page *page,
2166 * with the page in ext4_da_writepage 2277 * with the page in ext4_da_writepage
2167 */ 2278 */
2168 if (buffer_dirty(bh) && 2279 if (buffer_dirty(bh) &&
2169 (!buffer_mapped(bh) || buffer_delay(bh))) { 2280 (!buffer_mapped(bh) || buffer_delay(bh))) {
2170 mpage_add_bh_to_extent(mpd, logical, bh); 2281 mpage_add_bh_to_extent(mpd, logical,
2282 bh->b_size,
2283 bh->b_state);
2171 if (mpd->io_done) 2284 if (mpd->io_done)
2172 return MPAGE_DA_EXTENT_TAIL; 2285 return MPAGE_DA_EXTENT_TAIL;
2173 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2286 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2292,8 @@ static int __mpage_da_writepage(struct page *page,
2179 * unmapped buffer_head later we need to 2292 * unmapped buffer_head later we need to
2180 * use the b_state flag of that buffer_head. 2293 * use the b_state flag of that buffer_head.
2181 */ 2294 */
2182 if (mpd->lbh.b_size == 0) 2295 if (mpd->b_size == 0)
2183 mpd->lbh.b_state = 2296 mpd->b_state = bh->b_state & BH_FLAGS;
2184 bh->b_state & BH_FLAGS;
2185 } 2297 }
2186 logical++; 2298 logical++;
2187 } while ((bh = bh->b_this_page) != head); 2299 } while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2303,6 @@ static int __mpage_da_writepage(struct page *page,
2191} 2303}
2192 2304
2193/* 2305/*
2194 * mpage_da_writepages - walk the list of dirty pages of the given
2195 * address space, allocates non-allocated blocks, maps newly-allocated
2196 * blocks to existing bhs and issue IO them
2197 *
2198 * @mapping: address space structure to write
2199 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2200 * @get_block: the filesystem's block mapper function.
2201 *
2202 * This is a library function, which implements the writepages()
2203 * address_space_operation.
2204 */
2205static int mpage_da_writepages(struct address_space *mapping,
2206 struct writeback_control *wbc,
2207 struct mpage_da_data *mpd)
2208{
2209 int ret;
2210
2211 if (!mpd->get_block)
2212 return generic_writepages(mapping, wbc);
2213
2214 mpd->lbh.b_size = 0;
2215 mpd->lbh.b_state = 0;
2216 mpd->lbh.b_blocknr = 0;
2217 mpd->first_page = 0;
2218 mpd->next_page = 0;
2219 mpd->io_done = 0;
2220 mpd->pages_written = 0;
2221 mpd->retval = 0;
2222
2223 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2224 /*
2225 * Handle last extent of pages
2226 */
2227 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2228 if (mpage_da_map_blocks(mpd) == 0)
2229 mpage_da_submit_io(mpd);
2230
2231 mpd->io_done = 1;
2232 ret = MPAGE_DA_EXTENT_TAIL;
2233 }
2234 wbc->nr_to_write -= mpd->pages_written;
2235 return ret;
2236}
2237
2238/*
2239 * this is a special callback for ->write_begin() only 2306 * this is a special callback for ->write_begin() only
2240 * its intention is to return mapped block or reserve space 2307 * its intention is to return mapped block or reserve space
2241 */ 2308 */
@@ -2243,6 +2310,10 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2243 struct buffer_head *bh_result, int create) 2310 struct buffer_head *bh_result, int create)
2244{ 2311{
2245 int ret = 0; 2312 int ret = 0;
2313 sector_t invalid_block = ~((sector_t) 0xffff);
2314
2315 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2316 invalid_block = ~0;
2246 2317
2247 BUG_ON(create == 0); 2318 BUG_ON(create == 0);
2248 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2319 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
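
The sentinel gives a delayed-allocation buffer a recognizable, deliberately bogus block number instead of block 0, so stray I/O against a still-unallocated buffer fails loudly rather than landing on the superblock area. My reading of the fallback, as a sketch (blocks_count stands for ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)):

	sector_t invalid_block = ~((sector_t) 0xffff);	/* huge, distinctive */

	if (invalid_block < blocks_count)	/* fs so big it's a real block */
		invalid_block = ~0;		/* fall back to all-ones */

	map_bh(bh_result, sb, invalid_block);	/* placeholder until delalloc */
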
@@ -2264,59 +2335,21 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2264 /* not enough space to reserve */ 2335 /* not enough space to reserve */
2265 return ret; 2336 return ret;
2266 2337
2267 map_bh(bh_result, inode->i_sb, 0); 2338 map_bh(bh_result, inode->i_sb, invalid_block);
2268 set_buffer_new(bh_result); 2339 set_buffer_new(bh_result);
2269 set_buffer_delay(bh_result); 2340 set_buffer_delay(bh_result);
2270 } else if (ret > 0) { 2341 } else if (ret > 0) {
2271 bh_result->b_size = (ret << inode->i_blkbits); 2342 bh_result->b_size = (ret << inode->i_blkbits);
2272 ret = 0;
2273 }
2274
2275 return ret;
2276}
2277#define EXT4_DELALLOC_RSVED 1
2278static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2279 struct buffer_head *bh_result, int create)
2280{
2281 int ret;
2282 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2283 loff_t disksize = EXT4_I(inode)->i_disksize;
2284 handle_t *handle = NULL;
2285
2286 handle = ext4_journal_current_handle();
2287 BUG_ON(!handle);
2288 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2289 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2290 if (ret > 0) {
2291
2292 bh_result->b_size = (ret << inode->i_blkbits);
2293
2294 if (ext4_should_order_data(inode)) {
2295 int retval;
2296 retval = ext4_jbd2_file_inode(handle, inode);
2297 if (retval)
2298 /*
2299 * Failed to add inode for ordered
2300 * mode. Don't update file size
2301 */
2302 return retval;
2303 }
2304
2305 /* 2343 /*
2306 * Update on-disk size along with block allocation 2344 * With sub-block writes into unwritten extents
2307 * we don't use 'extend_disksize' as size may change 2345 * we also need to mark the buffer as new so that
2308 * within already allocated block -bzzz 2346 * the unwritten parts of the buffer gets correctly zeroed.
2309 */ 2347 */
2310 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2348 if (buffer_unwritten(bh_result))
2311 if (disksize > i_size_read(inode)) 2349 set_buffer_new(bh_result);
2312 disksize = i_size_read(inode);
2313 if (disksize > EXT4_I(inode)->i_disksize) {
2314 ext4_update_i_disksize(inode, disksize);
2315 ret = ext4_mark_inode_dirty(handle, inode);
2316 return ret;
2317 }
2318 ret = 0; 2350 ret = 0;
2319 } 2351 }
2352
2320 return ret; 2353 return ret;
2321} 2354}
2322 2355
@@ -2569,8 +2602,38 @@ retry:
2569 dump_stack(); 2602 dump_stack();
2570 goto out_writepages; 2603 goto out_writepages;
2571 } 2604 }
2572 mpd.get_block = ext4_da_get_block_write; 2605
2573 ret = mpage_da_writepages(mapping, wbc, &mpd); 2606 /*
2607 * Now call __mpage_da_writepage to find the next
2608 * contiguous region of logical blocks that need
2609 * blocks to be allocated by ext4. We don't actually
2610 * submit the blocks for I/O here, even though
2611 * write_cache_pages thinks it will, and will set the
2612 * pages as clean for write before calling
2613 * __mpage_da_writepage().
2614 */
2615 mpd.b_size = 0;
2616 mpd.b_state = 0;
2617 mpd.b_blocknr = 0;
2618 mpd.first_page = 0;
2619 mpd.next_page = 0;
2620 mpd.io_done = 0;
2621 mpd.pages_written = 0;
2622 mpd.retval = 0;
2623 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2624 &mpd);
2625 /*
2626 * If we have a contiguous extent of pages and we
2627 * haven't done the I/O yet, map the blocks and submit
2628 * them for I/O.
2629 */
2630 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2631 if (mpage_da_map_blocks(&mpd) == 0)
2632 mpage_da_submit_io(&mpd);
2633 mpd.io_done = 1;
2634 ret = MPAGE_DA_EXTENT_TAIL;
2635 }
2636 wbc->nr_to_write -= mpd.pages_written;
2574 2637
2575 ext4_journal_stop(handle); 2638 ext4_journal_stop(handle);
2576 2639
@@ -2846,6 +2909,48 @@ out:
2846 return; 2909 return;
2847} 2910}
2848 2911
2912/*
2913 * Force all delayed allocation blocks to be allocated for a given inode.
2914 */
2915int ext4_alloc_da_blocks(struct inode *inode)
2916{
2917 if (!EXT4_I(inode)->i_reserved_data_blocks &&
2918 !EXT4_I(inode)->i_reserved_meta_blocks)
2919 return 0;
2920
2921 /*
2922 * We do something simple for now. The filemap_flush() will
2923 * also start triggering a write of the data blocks, which is
2924 * not strictly speaking necessary (and for users of
2925 * laptop_mode, not even desirable). However, to do otherwise
2926 * would require replicating code paths in:
2927 *
2928 * ext4_da_writepages() ->
2929 * write_cache_pages() ---> (via passed in callback function)
2930 * __mpage_da_writepage() -->
2931 * mpage_add_bh_to_extent()
2932 * mpage_da_map_blocks()
2933 *
2934 * The problem is that write_cache_pages(), located in
2935 * mm/page-writeback.c, marks pages clean in preparation for
2936 * doing I/O, which is not desirable if we're not planning on
2937 * doing I/O at all.
2938 *
2939 * We could call write_cache_pages(), and then redirty all of
2940 * the pages by calling redirty_page_for_writepage() but that
2941 * would be ugly in the extreme. So instead we would need to
2942 * replicate parts of the code in the above functions,
2943 * simplifying them because we wouldn't actually intend to
2944 * write out the pages, but rather only collect contiguous
2945 * logical block extents, call the multi-block allocator, and
2946 * then update the buffer heads with the block allocations.
2947 *
2948 * For now, though, we'll cheat by calling filemap_flush(),
2949 * which will map the blocks, and start the I/O, but not
2950 * actually wait for the I/O to complete.
2951 */
2952 return filemap_flush(inode->i_mapping);
2953}
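
For contrast (a gloss, not part of the patch): filemap_flush() starts asynchronous writeback of the mapping and returns, which is enough to force the pending block allocations, whereas filemap_write_and_wait() would also block until the I/O completes:

	filemap_flush(mapping);			/* WB_SYNC_NONE: start, return */
	filemap_write_and_wait(mapping);	/* start and wait for the I/O */
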
2849 2954
2850/* 2955/*
2851 * bmap() is special. It gets used by applications such as lilo and by 2956 * bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3973,9 @@ void ext4_truncate(struct inode *inode)
3868 if (!ext4_can_truncate(inode)) 3973 if (!ext4_can_truncate(inode))
3869 return; 3974 return;
3870 3975
3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3978
3871 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3872 ext4_ext_truncate(inode); 3980 ext4_ext_truncate(inode);
3873 return; 3981 return;
@@ -4110,12 +4218,7 @@ make_io:
4110 unsigned num; 4218 unsigned num;
4111 4219
4112 table = ext4_inode_table(sb, gdp); 4220 table = ext4_inode_table(sb, gdp);
4113 /* Make sure s_inode_readahead_blks is a power of 2 */ 4221 /* s_inode_readahead_blks is always a power of 2 */
4114 while (EXT4_SB(sb)->s_inode_readahead_blks &
4115 (EXT4_SB(sb)->s_inode_readahead_blks-1))
4116 EXT4_SB(sb)->s_inode_readahead_blks =
4117 (EXT4_SB(sb)->s_inode_readahead_blks &
4118 (EXT4_SB(sb)->s_inode_readahead_blks-1));
4119 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4222 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4120 if (table > b) 4223 if (table > b)
4121 b = table; 4224 b = table;
@@ -4278,15 +4381,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4278 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4381 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4279 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4382 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4280 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4383 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4281 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4384 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4282 cpu_to_le32(EXT4_OS_HURD)) {
4283 ei->i_file_acl |= 4385 ei->i_file_acl |=
4284 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4386 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4285 }
4286 inode->i_size = ext4_isize(raw_inode); 4387 inode->i_size = ext4_isize(raw_inode);
4287 ei->i_disksize = inode->i_size; 4388 ei->i_disksize = inode->i_size;
4288 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4389 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4289 ei->i_block_group = iloc.block_group; 4390 ei->i_block_group = iloc.block_group;
4391 ei->i_last_alloc_group = ~0;
4290 /* 4392 /*
4291 * NOTE! The in-memory inode i_data array is in little-endian order 4393 * NOTE! The in-memory inode i_data array is in little-endian order
4292 * even on big-endian machines: we do NOT byteswap the block numbers! 4394 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4431,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4329 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4431 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4330 } 4432 }
4331 4433
4434 ret = 0;
4435 if (ei->i_file_acl &&
4436 ((ei->i_file_acl <
4437 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4438 EXT4_SB(sb)->s_gdb_count)) ||
4439 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4440 ext4_error(sb, __func__,
4441 "bad extended attribute block %llu in inode #%lu",
4442 ei->i_file_acl, inode->i_ino);
4443 ret = -EIO;
4444 goto bad_inode;
4445 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
4446 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4447 (S_ISLNK(inode->i_mode) &&
4448 !ext4_inode_is_fast_symlink(inode)))
4449 /* Validate extent which is part of inode */
4450 ret = ext4_ext_check_inode(inode);
4451 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4452 (S_ISLNK(inode->i_mode) &&
4453 !ext4_inode_is_fast_symlink(inode))) {
4454 /* Validate block references which are part of inode */
4455 ret = ext4_check_inode_blockref(inode);
4456 }
4457 if (ret) {
4458 brelse(bh);
4459 goto bad_inode;
4460 }
4461
4332 if (S_ISREG(inode->i_mode)) { 4462 if (S_ISREG(inode->i_mode)) {
4333 inode->i_op = &ext4_file_inode_operations; 4463 inode->i_op = &ext4_file_inode_operations;
4334 inode->i_fop = &ext4_file_operations; 4464 inode->i_fop = &ext4_file_operations;
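
The new xattr-block check is a plain range test; restated as a standalone predicate (hypothetical helper name, same bounds as the hunk):

    static int ext4_meta_block_in_range(struct super_block *sb,
                                        ext4_fsblk_t blk)
    {
            struct ext4_sb_info *sbi = EXT4_SB(sb);
            ext4_fsblk_t first = le32_to_cpu(sbi->s_es->s_first_data_block) +
                                 sbi->s_gdb_count;

            /* below the group descriptors or past the end: corrupt */
            return blk >= first && blk < ext4_blocks_count(sbi->s_es);
    }
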
@@ -4345,7 +4475,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4345 inode->i_op = &ext4_symlink_inode_operations; 4475 inode->i_op = &ext4_symlink_inode_operations;
4346 ext4_set_aops(inode); 4476 ext4_set_aops(inode);
4347 } 4477 }
4348 } else { 4478 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4479 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4349 inode->i_op = &ext4_special_inode_operations; 4480 inode->i_op = &ext4_special_inode_operations;
4350 if (raw_inode->i_block[0]) 4481 if (raw_inode->i_block[0])
4351 init_special_inode(inode, inode->i_mode, 4482 init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4484,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4353 else 4484 else
4354 init_special_inode(inode, inode->i_mode, 4485 init_special_inode(inode, inode->i_mode,
4355 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4486 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4487 } else {
4488 brelse(bh);
4489 ret = -EIO;
4490 ext4_error(inode->i_sb, __func__,
4491 "bogus i_mode (%o) for inode=%lu",
4492 inode->i_mode, inode->i_ino);
4493 goto bad_inode;
4356 } 4494 }
4357 brelse(iloc.bh); 4495 brelse(iloc.bh);
4358 ext4_set_inode_flags(inode); 4496 ext4_set_inode_flags(inode);
@@ -5146,8 +5284,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5146 return !buffer_mapped(bh); 5284 return !buffer_mapped(bh);
5147} 5285}
5148 5286
5149int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5287int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5150{ 5288{
5289 struct page *page = vmf->page;
5151 loff_t size; 5290 loff_t size;
5152 unsigned long len; 5291 unsigned long len;
5153 int ret = -EINVAL; 5292 int ret = -EINVAL;
@@ -5199,6 +5338,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
5199 goto out_unlock; 5338 goto out_unlock;
5200 ret = 0; 5339 ret = 0;
5201out_unlock: 5340out_unlock:
5341 if (ret)
5342 ret = VM_FAULT_SIGBUS;
5202 up_read(&inode->i_alloc_sem); 5343 up_read(&inode->i_alloc_sem);
5203 return ret; 5344 return ret;
5204} 5345}
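
The prototype change tracks the VM's switch to handing ->page_mkwrite() a struct vm_fault, with VM_FAULT_* codes replacing raw errnos on the way out; a skeletal handler under the new convention (sketch only, prepare_page_for_write() is a placeholder for the real checks):

    static int sketch_page_mkwrite(struct vm_area_struct *vma,
                                   struct vm_fault *vmf)
    {
            struct page *page = vmf->page;

            if (prepare_page_for_write(page))  /* placeholder checks */
                    return VM_FAULT_SIGBUS;    /* not -EIO/-EINVAL   */
            return 0;                          /* writable in place  */
    }
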
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 if (!S_ISDIR(inode->i_mode)) 51 flags = ext4_mask_flags(inode->i_mode, flags);
52 flags &= ~EXT4_DIRSYNC_FL;
53 52
54 err = -EPERM; 53 err = -EPERM;
55 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
263 return err; 262 return err;
264 } 263 }
265 264
265 case EXT4_IOC_ALLOC_DA_BLKS:
266 {
267 int err;
268 if (!is_owner_or_cap(inode))
269 return -EACCES;
270
271 err = mnt_want_write(filp->f_path.mnt);
272 if (err)
273 return err;
274 err = ext4_alloc_da_blocks(inode);
275 mnt_drop_write(filp->f_path.mnt);
276 return err;
277 }
278
266 default: 279 default:
267 return -ENOTTY; 280 return -ENOTTY;
268 } 281 }
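
From user space the new ioctl takes no argument; a hypothetical caller, assuming EXT4_IOC_ALLOC_DA_BLKS is visible through the ext4 ioctl definitions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    static void flush_delayed_blocks(const char *path)
    {
            int fd = open(path, O_RDONLY);

            if (fd >= 0 && ioctl(fd, EXT4_IOC_ALLOC_DA_BLKS) != 0)
                    perror("EXT4_IOC_ALLOC_DA_BLKS");
    }
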
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd039..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involves a request for multiple blocks 46 * The allocation request involves a request for multiple blocks
47 * near to the goal (block) value specified. 47 * near to the goal (block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 allocation or the current file size which ever is larger. If the size is 52 would have after allocation, or the current file size, whichever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small files use group preallocation is to 59 * The main motivation for having small files use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator(using the buddy cache) supports few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contigous block in 142 * stripe size (sbi->s_stripe), we try to search for contigous block in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicate how long the mballoc __must__ look for a best 146 * min_to_scan indicate how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
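
Condensed, the policy this comment block describes, as a self-contained sketch (sizes in blocks; only s_mb_stream_request is a real field):

    static int use_group_preallocation(unsigned long long cur_size,
                                       unsigned long long alloc_size,
                                       unsigned long stream_request)
    {
            unsigned long long size = cur_size > alloc_size ? cur_size
                                                            : alloc_size;

            /* nonzero: small file, pack it with its locality group */
            return size < stream_request;  /* default threshold: 16 */
    }
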
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726{ 1725{
1727 unsigned free, fragments; 1726 unsigned free, fragments;
1728 unsigned i, bits; 1727 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc; 1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1731
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0; 1748 return 0;
1749 1749
1750 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1752 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1753 ((group % flex_size) == 0))
1754 return 0;
1755
1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1756 bits = ac->ac_sb->s_blocksize_bits + 1;
1751 for (i = ac->ac_2order; i <= bits; i++) 1757 for (i = ac->ac_2order; i <= bits; i++)
1752 if (grp->bb_counters[i] > 0) 1758 if (grp->bb_counters[i] > 0)
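
The added test keeps data allocations out of the first block group of each flexgroup, where the flexgroup's bitmaps and inode tables are concentrated; as a standalone predicate (hypothetical name, real macro):

    static int flex_bg_reserved_for_metadata(ext4_group_t group, int flex_size)
    {
            return flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME &&
                   (group % flex_size) == 0;
    }
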
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1971 /* 1977 /*
1972 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1973 * is greater than equal to the sbi_s_mb_order2_reqs 1979 * is greater than equal to the sbi_s_mb_order2_reqs
1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1975 */ 1981 */
1976 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1977 /* 1983 /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2699 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2700 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2695 if (sbi->s_mb_maxs == NULL) { 2701 if (sbi->s_mb_maxs == NULL) {
2696 kfree(sbi->s_mb_maxs); 2702 kfree(sbi->s_mb_offsets);
2697 return -ENOMEM; 2703 return -ENOMEM;
2698 } 2704 }
2699 2705
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2746 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2747 } 2753 }
2748 2754
2749 ext4_mb_init_per_dev_proc(sb);
2750 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2751 2756
2752 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2829 2834
2830 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2831 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2832 ext4_mb_destroy_per_dev_proc(sb);
2833 2837
2834 return 0; 2838 return 0;
2835} 2839}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2891} 2895}
2892 2896
2893#define EXT4_MB_STATS_NAME "stats"
2894#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2895#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2896#define EXT4_MB_ORDER2_REQ "order2_req"
2897#define EXT4_MB_STREAM_REQ "stream_req"
2898#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2899
2900static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2901{
2902#ifdef CONFIG_PROC_FS
2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2904 struct ext4_sb_info *sbi = EXT4_SB(sb);
2905 struct proc_dir_entry *proc;
2906
2907 if (sbi->s_proc == NULL)
2908 return -EINVAL;
2909
2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2916 return 0;
2917
2918err_out:
2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2925 return -ENOMEM;
2926#else
2927 return 0;
2928#endif
2929}
2930
2931static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2932{
2933#ifdef CONFIG_PROC_FS
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935
2936 if (sbi->s_proc == NULL)
2937 return -EINVAL;
2938
2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2945#endif
2946 return 0;
2947}
2948
2949int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2950{ 2898{
2951 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3096 if (sbi->s_log_groups_per_flex) { 3044 if (sbi->s_log_groups_per_flex) {
3097 ext4_group_t flex_group = ext4_flex_group(sbi, 3045 ext4_group_t flex_group = ext4_flex_group(sbi,
3098 ac->ac_b_ex.fe_group); 3046 ac->ac_b_ex.fe_group);
3099 spin_lock(sb_bgl_lock(sbi, flex_group)); 3047 atomic_sub(ac->ac_b_ex.fe_len,
3100 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 3048 &sbi->s_flex_groups[flex_group].free_blocks);
3101 spin_unlock(sb_bgl_lock(sbi, flex_group));
3102 } 3049 }
3103 3050
3104 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3051 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
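
Here, in the free path below, and in the resize.c hunk, the per-flexgroup counters move from spinlock-protected integers to atomic_t; in isolation the pattern is:

    /* allocation:  atomic_sub(ac->ac_b_ex.fe_len, &fg->free_blocks);
     * free/resize: atomic_add(count,              &fg->free_blocks);
     * no sb_bgl_lock needed: the counter is a plain tally and is never
     * part of a larger read-modify-write invariant
     */
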
@@ -3116,7 +3063,7 @@ out_err:
3116 * here we normalize request for locality group 3063 * here we normalize request for locality group
3117 * Group requests are normalized to s_stripe size if we set the same via mount 3064 * Group requests are normalized to s_stripe size if we set the same via mount
3118 * option. If not we set it to s_mb_group_prealloc which can be configured via 3065 * option. If not we set it to s_mb_group_prealloc which can be configured via
3119 * /proc/fs/ext4/<partition>/group_prealloc 3066 * /sys/fs/ext4/<partition>/mb_group_prealloc
3120 * 3067 *
3121 * XXX: should we try to preallocate more than the group has now? 3068 * XXX: should we try to preallocate more than the group has now?
3122 */ 3069 */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3608 spin_unlock(&pa->pa_lock); 3555 spin_unlock(&pa->pa_lock);
3609 3556
3610 grp_blk = pa->pa_pstart; 3557 grp_blk = pa->pa_pstart;
3611 /* If linear, pa_pstart may be in the next group when pa is used up */ 3558 /*
3612 if (pa->pa_linear) 3559 * If doing group-based preallocation, pa_pstart may be in the
3560 * next group when pa is used up
3561 */
3562 if (pa->pa_type == MB_GROUP_PA)
3613 grp_blk--; 3563 grp_blk--;
3614 3564
3615 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3565 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3704 INIT_LIST_HEAD(&pa->pa_inode_list); 3654 INIT_LIST_HEAD(&pa->pa_inode_list);
3705 INIT_LIST_HEAD(&pa->pa_group_list); 3655 INIT_LIST_HEAD(&pa->pa_group_list);
3706 pa->pa_deleted = 0; 3656 pa->pa_deleted = 0;
3707 pa->pa_linear = 0; 3657 pa->pa_type = MB_INODE_PA;
3708 3658
3709 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3710 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3767 INIT_LIST_HEAD(&pa->pa_inode_list); 3717 INIT_LIST_HEAD(&pa->pa_inode_list);
3768 INIT_LIST_HEAD(&pa->pa_group_list); 3718 INIT_LIST_HEAD(&pa->pa_group_list);
3769 pa->pa_deleted = 0; 3719 pa->pa_deleted = 0;
3770 pa->pa_linear = 1; 3720 pa->pa_type = MB_GROUP_PA;
3771 3721
3772 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3773 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
4021 list_del_rcu(&pa->pa_inode_list); 3971 list_del_rcu(&pa->pa_inode_list);
4022 spin_unlock(pa->pa_obj_lock); 3972 spin_unlock(pa->pa_obj_lock);
4023 3973
4024 if (pa->pa_linear) 3974 if (pa->pa_type == MB_GROUP_PA)
4025 ext4_mb_release_group_pa(&e4b, pa, ac); 3975 ext4_mb_release_group_pa(&e4b, pa, ac);
4026 else 3976 else
4027 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3977 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
4121 spin_unlock(&ei->i_prealloc_lock); 4071 spin_unlock(&ei->i_prealloc_lock);
4122 4072
4123 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4073 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4124 BUG_ON(pa->pa_linear != 0); 4074 BUG_ON(pa->pa_type != MB_INODE_PA);
4125 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4075 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4126 4076
4127 err = ext4_mb_load_buddy(sb, group, &e4b); 4077 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4232 * file is determined by the current size or the resulting size after 4182 * file is determined by the current size or the resulting size after
4233 * allocation whichever is larger 4183 * allocation whichever is larger
4234 * 4184 *
4235 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4185 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4236 */ 4186 */
4237static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4187static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4238{ 4188{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4373 continue; 4323 continue;
4374 } 4324 }
4375 /* only lg prealloc space */ 4325 /* only lg prealloc space */
4376 BUG_ON(!pa->pa_linear); 4326 BUG_ON(pa->pa_type != MB_GROUP_PA);
4377 4327
4378 /* seems this one can be freed ... */ 4328 /* seems this one can be freed ... */
4379 pa->pa_deleted = 1; 4329 pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4442 pa_inode_list) { 4392 pa_inode_list) {
4443 spin_lock(&tmp_pa->pa_lock); 4393 spin_lock(&tmp_pa->pa_lock);
4444 if (tmp_pa->pa_deleted) { 4394 if (tmp_pa->pa_deleted) {
4445 spin_unlock(&pa->pa_lock); 4395 spin_unlock(&tmp_pa->pa_lock);
4446 continue; 4396 continue;
4447 } 4397 }
4448 if (!added && pa->pa_free < tmp_pa->pa_free) { 4398 if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4479{ 4429{
4480 struct ext4_prealloc_space *pa = ac->ac_pa; 4430 struct ext4_prealloc_space *pa = ac->ac_pa;
4481 if (pa) { 4431 if (pa) {
4482 if (pa->pa_linear) { 4432 if (pa->pa_type == MB_GROUP_PA) {
4483 /* see comment in ext4_mb_use_group_pa() */ 4433 /* see comment in ext4_mb_use_group_pa() */
4484 spin_lock(&pa->pa_lock); 4434 spin_lock(&pa->pa_lock);
4485 pa->pa_pstart += ac->ac_b_ex.fe_len; 4435 pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4499 * doesn't grow big. We need to release 4449 * doesn't grow big. We need to release
4500 * alloc_semp before calling ext4_mb_add_n_trim() 4450 * alloc_semp before calling ext4_mb_add_n_trim()
4501 */ 4451 */
4502 if (pa->pa_linear && likely(pa->pa_free)) { 4452 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4503 spin_lock(pa->pa_obj_lock); 4453 spin_lock(pa->pa_obj_lock);
4504 list_del_rcu(&pa->pa_inode_list); 4454 list_del_rcu(&pa->pa_inode_list);
4505 spin_unlock(pa->pa_obj_lock); 4455 spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
4936 4886
4937 if (sbi->s_log_groups_per_flex) { 4887 if (sbi->s_log_groups_per_flex) {
4938 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4888 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4939 spin_lock(sb_bgl_lock(sbi, flex_group)); 4889 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4940 sbi->s_flex_groups[flex_group].free_blocks += count;
4941 spin_unlock(sb_bgl_lock(sbi, flex_group));
4942 } 4890 }
4943 4891
4944 ext4_mb_release_desc(&e4b); 4892 ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
132 ext4_lblk_t pa_lstart; /* log. block */ 132 ext4_lblk_t pa_lstart; /* log. block */
133 unsigned short pa_len; /* len of preallocated chunk */ 133 unsigned short pa_len; /* len of preallocated chunk */
134 unsigned short pa_free; /* how many blocks are free */ 134 unsigned short pa_free; /* how many blocks are free */
135 unsigned short pa_linear; /* consumed in one direction 135 unsigned short pa_type; /* pa type. inode or group */
136 * strictly, for grp prealloc */
137 spinlock_t *pa_obj_lock; 136 spinlock_t *pa_obj_lock;
138 struct inode *pa_inode; /* hack, for history only */ 137 struct inode *pa_inode; /* hack, for history only */
139}; 138};
140 139
140enum {
141 MB_INODE_PA = 0,
142 MB_GROUP_PA = 1
143};
141 144
142struct ext4_free_extent { 145struct ext4_free_extent {
143 ext4_lblk_t fe_logical; 146 ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
247 250
248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 251#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
249 252
250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 253static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
252 struct ext4_free_extent *fex) 254 struct ext4_free_extent *fex)
253{ 255{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3ee..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release(struct dx_frame *frames); 163static void dx_release(struct dx_frame *frames);
164static int dx_make_map(struct ext4_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count, unsigned blocksize);
169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); 169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block(struct dx_frame *frame, 170static void dx_insert_block(struct dx_frame *frame,
171 u32 hash, ext4_lblk_t block); 171 u32 hash, ext4_lblk_t block);
172static int ext4_htree_next_block(struct inode *dir, __u32 hash, 172static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 181 struct inode *inode);
182 182
183unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
184{
185 unsigned len = le16_to_cpu(dlen);
186
187 if (len == EXT4_MAX_REC_LEN || len == 0)
188 return blocksize;
189 return (len & 65532) | ((len & 3) << 16);
190}
191
192__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
193{
194 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
195 BUG();
196 if (len < 65536)
197 return cpu_to_le16(len);
198 if (len == blocksize) {
199 if (blocksize == 65536)
200 return cpu_to_le16(EXT4_MAX_REC_LEN);
201 else
202 return cpu_to_le16(0);
203 }
204 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
205}
206
183/* 207/*
184 * p is at least 6 bytes before the end of page 208 * p is at least 6 bytes before the end of page
185 */ 209 */
186static inline struct ext4_dir_entry_2 * 210static inline struct ext4_dir_entry_2 *
187ext4_next_entry(struct ext4_dir_entry_2 *p) 211ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
188{ 212{
189 return (struct ext4_dir_entry_2 *)((char *)p + 213 return (struct ext4_dir_entry_2 *)((char *)p +
190 ext4_rec_len_from_disk(p->rec_len)); 214 ext4_rec_len_from_disk(p->rec_len, blocksize));
191} 215}
192 216
193/* 217/*
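
The two helpers above squeeze rec_len values for blocks up to 256KB into 16 bits: rec_len is always 4-byte aligned, so the two low bits are free to carry bits 16-17 of the length, and 0/EXT4_MAX_REC_LEN mean "whole block". A user-space round-trip check of the same encoding (sketch; assumes EXT4_MAX_REC_LEN is 65535):

    #include <assert.h>

    static unsigned short to_disk(unsigned len, unsigned bs)
    {
            assert(len <= bs && bs <= (1 << 18) && !(len & 3));
            if (len < 65536)
                    return len;
            if (len == bs)
                    return bs == 65536 ? 65535 : 0;
            return (len & 65532) | ((len >> 16) & 3);
    }

    static unsigned from_disk(unsigned short dlen, unsigned bs)
    {
            unsigned len = dlen;

            if (len == 65535 || len == 0)
                    return bs;
            return (len & 65532) | ((len & 3) << 16);
    }

    int main(void)
    {
            unsigned bs, len;

            for (bs = 1024; bs <= 65536; bs *= 2)
                    for (len = 4; len <= bs; len += 4)
                            assert(from_disk(to_disk(len, bs), bs) == len);
            /* a length that needs bit 16, on a 128KB block */
            assert(from_disk(to_disk(70000, 131072), 131072) == 70000);
            return 0;
    }
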
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
294 space += EXT4_DIR_REC_LEN(de->name_len); 318 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 319 names++;
296 } 320 }
297 de = ext4_next_entry(de); 321 de = ext4_next_entry(de, size);
298 } 322 }
299 printk("(%i)\n", names); 323 printk("(%i)\n", names);
300 return (struct stats) { names, space, 1 }; 324 return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
585 top = (struct ext4_dir_entry_2 *) ((char *) de + 609 top = (struct ext4_dir_entry_2 *) ((char *) de +
586 dir->i_sb->s_blocksize - 610 dir->i_sb->s_blocksize -
587 EXT4_DIR_REC_LEN(0)); 611 EXT4_DIR_REC_LEN(0));
588 for (; de < top; de = ext4_next_entry(de)) { 612 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
589 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 613 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
590 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 614 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
591 +((char *)de - bh->b_data))) { 615 +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
663 } 687 }
664 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 688 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
665 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 689 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
666 de = ext4_next_entry(de); 690 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
667 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 691 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
668 goto errout; 692 goto errout;
669 count++; 693 count++;
@@ -713,15 +737,15 @@ errout:
713 * Create map of hash values, offsets, and sizes, stored at end of block. 737 * Create map of hash values, offsets, and sizes, stored at end of block.
714 * Returns number of entries mapped. 738 * Returns number of entries mapped.
715 */ 739 */
716static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 740static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
717 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 741 struct dx_hash_info *hinfo,
742 struct dx_map_entry *map_tail)
718{ 743{
719 int count = 0; 744 int count = 0;
720 char *base = (char *) de; 745 char *base = (char *) de;
721 struct dx_hash_info h = *hinfo; 746 struct dx_hash_info h = *hinfo;
722 747
723 while ((char *) de < base + size) 748 while ((char *) de < base + blocksize) {
724 {
725 if (de->name_len && de->inode) { 749 if (de->name_len && de->inode) {
726 ext4fs_dirhash(de->name, de->name_len, &h); 750 ext4fs_dirhash(de->name, de->name_len, &h);
727 map_tail--; 751 map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
732 cond_resched(); 756 cond_resched();
733 } 757 }
734 /* XXX: do we need to check rec_len == 0 case? -Chris */ 758 /* XXX: do we need to check rec_len == 0 case? -Chris */
735 de = ext4_next_entry(de); 759 de = ext4_next_entry(de, blocksize);
736 } 760 }
737 return count; 761 return count;
738} 762}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
832 return 1; 856 return 1;
833 } 857 }
834 /* prevent looping on a bad block */ 858 /* prevent looping on a bad block */
835 de_len = ext4_rec_len_from_disk(de->rec_len); 859 de_len = ext4_rec_len_from_disk(de->rec_len,
860 dir->i_sb->s_blocksize);
836 if (de_len <= 0) 861 if (de_len <= 0)
837 return -1; 862 return -1;
838 offset += de_len; 863 offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
996 de = (struct ext4_dir_entry_2 *) bh->b_data; 1021 de = (struct ext4_dir_entry_2 *) bh->b_data;
997 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 1022 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
998 EXT4_DIR_REC_LEN(0)); 1023 EXT4_DIR_REC_LEN(0));
999 for (; de < top; de = ext4_next_entry(de)) { 1024 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1000 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 1025 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1001 + ((char *) de - bh->b_data); 1026 + ((char *) de - bh->b_data);
1002 1027
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1052 return ERR_PTR(-EIO); 1077 return ERR_PTR(-EIO);
1053 } 1078 }
1054 inode = ext4_iget(dir->i_sb, ino); 1079 inode = ext4_iget(dir->i_sb, ino);
1055 if (IS_ERR(inode)) 1080 if (unlikely(IS_ERR(inode))) {
1056 return ERR_CAST(inode); 1081 if (PTR_ERR(inode) == -ESTALE) {
1082 ext4_error(dir->i_sb, __func__,
1083 "deleted inode referenced: %u",
1084 ino);
1085 return ERR_PTR(-EIO);
1086 } else {
1087 return ERR_CAST(inode);
1088 }
1089 }
1057 } 1090 }
1058 return d_splice_alias(inode, dentry); 1091 return d_splice_alias(inode, dentry);
1059} 1092}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
1109 * Returns pointer to last entry moved. 1142 * Returns pointer to last entry moved.
1110 */ 1143 */
1111static struct ext4_dir_entry_2 * 1144static struct ext4_dir_entry_2 *
1112dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) 1145dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1146 unsigned blocksize)
1113{ 1147{
1114 unsigned rec_len = 0; 1148 unsigned rec_len = 0;
1115 1149
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1118 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1119 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1120 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
1121 ext4_rec_len_to_disk(rec_len); 1155 ext4_rec_len_to_disk(rec_len, blocksize);
1122 de->inode = 0; 1156 de->inode = 0;
1123 map++; 1157 map++;
1124 to += rec_len; 1158 to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1130 * Compact each dir entry in the range to the minimal rec_len. 1164 * Compact each dir entry in the range to the minimal rec_len.
1131 * Returns pointer to last entry in range. 1165 * Returns pointer to last entry in range.
1132 */ 1166 */
1133static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) 1167static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1134{ 1168{
1135 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1169 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1136 unsigned rec_len = 0; 1170 unsigned rec_len = 0;
1137 1171
1138 prev = to = de; 1172 prev = to = de;
1139 while ((char*)de < base + size) { 1173 while ((char*)de < base + blocksize) {
1140 next = ext4_next_entry(de); 1174 next = ext4_next_entry(de, blocksize);
1141 if (de->inode && de->name_len) { 1175 if (de->inode && de->name_len) {
1142 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1176 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1143 if (de > to) 1177 if (de > to)
1144 memmove(to, de, rec_len); 1178 memmove(to, de, rec_len);
1145 to->rec_len = ext4_rec_len_to_disk(rec_len); 1179 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1146 prev = to; 1180 prev = to;
1147 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1181 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1148 } 1182 }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1215 hash2, split, count-split)); 1249 hash2, split, count-split));
1216 1250
1217 /* Fancy dance to stay within two buffers */ 1251 /* Fancy dance to stay within two buffers */
1218 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1252 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1219 de = dx_pack_dirents(data1, blocksize); 1253 de = dx_pack_dirents(data1, blocksize);
1220 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1254 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1221 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1255 blocksize);
1256 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
1257 blocksize);
1222 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1258 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1223 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1259 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1224 1260
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1268 const char *name = dentry->d_name.name; 1304 const char *name = dentry->d_name.name;
1269 int namelen = dentry->d_name.len; 1305 int namelen = dentry->d_name.len;
1270 unsigned int offset = 0; 1306 unsigned int offset = 0;
1307 unsigned int blocksize = dir->i_sb->s_blocksize;
1271 unsigned short reclen; 1308 unsigned short reclen;
1272 int nlen, rlen, err; 1309 int nlen, rlen, err;
1273 char *top; 1310 char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1275 reclen = EXT4_DIR_REC_LEN(namelen); 1312 reclen = EXT4_DIR_REC_LEN(namelen);
1276 if (!de) { 1313 if (!de) {
1277 de = (struct ext4_dir_entry_2 *)bh->b_data; 1314 de = (struct ext4_dir_entry_2 *)bh->b_data;
1278 top = bh->b_data + dir->i_sb->s_blocksize - reclen; 1315 top = bh->b_data + blocksize - reclen;
1279 while ((char *) de <= top) { 1316 while ((char *) de <= top) {
1280 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1281 bh, offset)) { 1318 bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1287 return -EEXIST; 1324 return -EEXIST;
1288 } 1325 }
1289 nlen = EXT4_DIR_REC_LEN(de->name_len); 1326 nlen = EXT4_DIR_REC_LEN(de->name_len);
1290 rlen = ext4_rec_len_from_disk(de->rec_len); 1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1291 if ((de->inode? rlen - nlen: rlen) >= reclen) 1328 if ((de->inode? rlen - nlen: rlen) >= reclen)
1292 break; 1329 break;
1293 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1330 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1306 1343
1307 /* By now the buffer is marked for journaling */ 1344 /* By now the buffer is marked for journaling */
1308 nlen = EXT4_DIR_REC_LEN(de->name_len); 1345 nlen = EXT4_DIR_REC_LEN(de->name_len);
1309 rlen = ext4_rec_len_from_disk(de->rec_len); 1346 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1310 if (de->inode) { 1347 if (de->inode) {
1311 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1348 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1312 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); 1349 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1313 de->rec_len = ext4_rec_len_to_disk(nlen); 1350 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1314 de = de1; 1351 de = de1;
1315 } 1352 }
1316 de->file_type = EXT4_FT_UNKNOWN; 1353 de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1380 /* The 0th block becomes the root, move the dirents out */ 1417 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot; 1418 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde + 1419 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len)); 1420 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1384 if ((char *) de >= (((char *) root) + blocksize)) { 1421 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__, 1422 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu", 1423 "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1402 memcpy (data1, de, len); 1439 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1440 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1441 top = data1 + len;
1405 while ((char *)(de2 = ext4_next_entry(de)) < top) 1442 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1406 de = de2; 1443 de = de2;
1407 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1444 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1445 blocksize);
1408 /* Initialize the root; the dot dirents already exist */ 1446 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1447 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); 1448 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1449 blocksize);
1411 memset (&root->info, 0, sizeof(root->info)); 1450 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1451 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1452 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1488 return retval; 1527 return retval;
1489 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1490 de->inode = 0; 1529 de->inode = 0;
1491 de->rec_len = ext4_rec_len_to_disk(blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1492 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1493} 1532}
1494 1533
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1551 goto cleanup; 1590 goto cleanup;
1552 node2 = (struct dx_node *)(bh2->b_data); 1591 node2 = (struct dx_node *)(bh2->b_data);
1553 entries2 = node2->entries; 1592 entries2 = node2->entries;
1554 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); 1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize);
1555 node2->fake.inode = 0; 1595 node2->fake.inode = 0;
1556 BUFFER_TRACE(frame->bh, "get_write_access"); 1596 BUFFER_TRACE(frame->bh, "get_write_access");
1557 err = ext4_journal_get_write_access(handle, frame->bh); 1597 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
1639 struct buffer_head *bh) 1679 struct buffer_head *bh)
1640{ 1680{
1641 struct ext4_dir_entry_2 *de, *pde; 1681 struct ext4_dir_entry_2 *de, *pde;
1682 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1683 int i;
1643 1684
1644 i = 0; 1685 i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 ext4_journal_get_write_access(handle, bh); 1693 ext4_journal_get_write_access(handle, bh);
1653 if (pde) 1694 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1695 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len) + 1696 ext4_rec_len_from_disk(pde->rec_len,
1656 ext4_rec_len_from_disk(de->rec_len)); 1697 blocksize) +
1698 ext4_rec_len_from_disk(de->rec_len,
1699 blocksize),
1700 blocksize);
1657 else 1701 else
1658 de->inode = 0; 1702 de->inode = 0;
1659 dir->i_version++; 1703 dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
1661 ext4_handle_dirty_metadata(handle, dir, bh); 1705 ext4_handle_dirty_metadata(handle, dir, bh);
1662 return 0; 1706 return 0;
1663 } 1707 }
1664 i += ext4_rec_len_from_disk(de->rec_len); 1708 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
1665 pde = de; 1709 pde = de;
1666 de = ext4_next_entry(de); 1710 de = ext4_next_entry(de, blocksize);
1667 } 1711 }
1668 return -ENOENT; 1712 return -ENOENT;
1669} 1713}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1793 struct inode *inode; 1837 struct inode *inode;
1794 struct buffer_head *dir_block; 1838 struct buffer_head *dir_block;
1795 struct ext4_dir_entry_2 *de; 1839 struct ext4_dir_entry_2 *de;
1840 unsigned int blocksize = dir->i_sb->s_blocksize;
1796 int err, retries = 0; 1841 int err, retries = 0;
1797 1842
1798 if (EXT4_DIR_LINK_MAX(dir)) 1843 if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1869 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1870 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1871 de->name_len = 1;
1827 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1872 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
1873 blocksize);
1828 strcpy(de->name, "."); 1874 strcpy(de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1875 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = ext4_next_entry(de); 1876 de = ext4_next_entry(de, blocksize);
1831 de->inode = cpu_to_le32(dir->i_ino); 1877 de->inode = cpu_to_le32(dir->i_ino);
1832 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1878 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
1833 EXT4_DIR_REC_LEN(1)); 1879 blocksize);
1834 de->name_len = 2; 1880 de->name_len = 2;
1835 strcpy(de->name, ".."); 1881 strcpy(de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1882 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
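
For reference, the block these lines build (EXT4_DIR_REC_LEN(n) rounds 8 + n up to a multiple of four, so both dot entries need 12 bytes):

    /* fresh directory block, blocksize bs:
     *
     *   offset  0: "."   inode = self    rec_len = 12
     *   offset 12: ".."  inode = parent  rec_len = bs - 12  (rest of block)
     */
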
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
1885 return 1; 1931 return 1;
1886 } 1932 }
1887 de = (struct ext4_dir_entry_2 *) bh->b_data; 1933 de = (struct ext4_dir_entry_2 *) bh->b_data;
1888 de1 = ext4_next_entry(de); 1934 de1 = ext4_next_entry(de, sb->s_blocksize);
1889 if (le32_to_cpu(de->inode) != inode->i_ino || 1935 if (le32_to_cpu(de->inode) != inode->i_ino ||
1890 !le32_to_cpu(de1->inode) || 1936 !le32_to_cpu(de1->inode) ||
1891 strcmp(".", de->name) || 1937 strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
1896 brelse(bh); 1942 brelse(bh);
1897 return 1; 1943 return 1;
1898 } 1944 }
1899 offset = ext4_rec_len_from_disk(de->rec_len) + 1945 offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
1900 ext4_rec_len_from_disk(de1->rec_len); 1946 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
1901 de = ext4_next_entry(de1); 1947 de = ext4_next_entry(de1, sb->s_blocksize);
1902 while (offset < inode->i_size) { 1948 while (offset < inode->i_size) {
1903 if (!bh || 1949 if (!bh ||
1904 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1950 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
1927 brelse(bh); 1973 brelse(bh);
1928 return 0; 1974 return 0;
1929 } 1975 }
1930 offset += ext4_rec_len_from_disk(de->rec_len); 1976 offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
1931 de = ext4_next_entry(de); 1977 de = ext4_next_entry(de, sb->s_blocksize);
1932 } 1978 }
1933 brelse(bh); 1979 brelse(bh);
1934 return 1; 1980 return 1;
@@ -2297,8 +2343,8 @@ retry:
2297 return err; 2343 return err;
2298} 2344}
2299 2345
2300#define PARENT_INO(buffer) \ 2346#define PARENT_INO(buffer, size) \
2301 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) 2347 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
2302 2348
2303/* 2349/*
2304 * Anybody can rename anything with this: the permission checks are left to the 2350 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2311 struct inode *old_inode, *new_inode; 2357 struct inode *old_inode, *new_inode;
2312 struct buffer_head *old_bh, *new_bh, *dir_bh; 2358 struct buffer_head *old_bh, *new_bh, *dir_bh;
2313 struct ext4_dir_entry_2 *old_de, *new_de; 2359 struct ext4_dir_entry_2 *old_de, *new_de;
2314 int retval; 2360 int retval, force_da_alloc = 0;
2315 2361
2316 old_bh = new_bh = dir_bh = NULL; 2362 old_bh = new_bh = dir_bh = NULL;
2317 2363
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2404 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2359 if (!dir_bh) 2405 if (!dir_bh)
2360 goto end_rename; 2406 goto end_rename;
2361 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2407 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2408 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2362 goto end_rename; 2409 goto end_rename;
2363 retval = -EMLINK; 2410 retval = -EMLINK;
2364 if (!new_inode && new_dir != old_dir && 2411 if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2430 if (dir_bh) { 2477 if (dir_bh) {
2431 BUFFER_TRACE(dir_bh, "get_write_access"); 2478 BUFFER_TRACE(dir_bh, "get_write_access");
2432 ext4_journal_get_write_access(handle, dir_bh); 2479 ext4_journal_get_write_access(handle, dir_bh);
2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2480 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2481 cpu_to_le32(new_dir->i_ino);
2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2482 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2483 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2436 ext4_dec_count(handle, old_dir); 2484 ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2449 ext4_mark_inode_dirty(handle, new_inode); 2497 ext4_mark_inode_dirty(handle, new_inode);
2450 if (!new_inode->i_nlink) 2498 if (!new_inode->i_nlink)
2451 ext4_orphan_add(handle, new_inode); 2499 ext4_orphan_add(handle, new_inode);
2500 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2501 force_da_alloc = 1;
2452 } 2502 }
2453 retval = 0; 2503 retval = 0;
2454 2504
@@ -2457,6 +2507,8 @@ end_rename:
2457 brelse(old_bh); 2507 brelse(old_bh);
2458 brelse(new_bh); 2508 brelse(new_bh);
2459 ext4_journal_stop(handle); 2509 ext4_journal_stop(handle);
2510 if (retval == 0 && force_da_alloc)
2511 ext4_alloc_da_blocks(old_inode);
2460 return retval; 2512 return retval;
2461} 2513}
2462 2514
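
The force_da_alloc path above covers the second classic replace idiom, write-to-temp-then-rename; a user-space sketch (paths illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static void replace_config(const char *buf, size_t len)
    {
            int fd = open("app.conf.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);

            write(fd, buf, len);
            close(fd);
            /* with auto_da_alloc (the default), the incoming inode's delayed
             * blocks are flushed, so a crash can't leave "app.conf" empty */
            rename("app.conf.tmp", "app.conf");
    }
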
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
939 ext4_group_t flex_group; 939 ext4_group_t flex_group;
940 flex_group = ext4_flex_group(sbi, input->group); 940 flex_group = ext4_flex_group(sbi, input->group);
941 sbi->s_flex_groups[flex_group].free_blocks += 941 atomic_add(input->free_blocks_count,
942 input->free_blocks_count; 942 &sbi->s_flex_groups[flex_group].free_blocks);
943 sbi->s_flex_groups[flex_group].free_inodes += 943 atomic_add(EXT4_INODES_PER_GROUP(sb),
944 EXT4_INODES_PER_GROUP(sb); 944 &sbi->s_flex_groups[flex_group].free_inodes);
945 } 945 }
946 946
947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923d..2958f4e6f222 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/ctype.h>
38#include <linux/marker.h> 39#include <linux/marker.h>
39#include <linux/log2.h> 40#include <linux/log2.h>
40#include <linux/crc16.h> 41#include <linux/crc16.h>
@@ -48,6 +49,7 @@
48#include "group.h" 49#include "group.h"
49 50
50struct proc_dir_entry *ext4_proc_root; 51struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset;
51 53
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 55 unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
577 ext4_commit_super(sb, es, 1); 579 ext4_commit_super(sb, es, 1);
578 } 580 }
579 if (sbi->s_proc) { 581 if (sbi->s_proc) {
580 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
581 remove_proc_entry(sb->s_id, ext4_proc_root); 582 remove_proc_entry(sb->s_id, ext4_proc_root);
582 } 583 }
584 kobject_del(&sbi->s_kobj);
583 585
584 for (i = 0; i < sbi->s_gdb_count; i++) 586 for (i = 0; i < sbi->s_gdb_count; i++)
585 brelse(sbi->s_group_desc[i]); 587 brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
615 ext4_blkdev_remove(sbi); 617 ext4_blkdev_remove(sbi);
616 } 618 }
617 sb->s_fs_info = NULL; 619 sb->s_fs_info = NULL;
620 /*
621 * Now that we are completely done shutting down the
622 * superblock, we need to actually destroy the kobject.
623 */
624 unlock_kernel();
625 unlock_super(sb);
626 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock);
618 kfree(sbi); 631 kfree(sbi);
619 return; 632 return;
620} 633}
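
The wait_for_completion() above pairs with the kobject's release hook, which this hunk does not show; the counterpart would look roughly like (sketch, assuming s_kobj and s_kobj_unregister live in ext4_sb_info):

    static void ext4_sb_release(struct kobject *kobj)
    {
            struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
                                                    s_kobj);

            complete(&sbi->s_kobj_unregister);  /* lets put_super finish */
    }
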
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
803 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 816 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
804 seq_puts(seq, ",noacl"); 817 seq_puts(seq, ",noacl");
805#endif 818#endif
806 if (!test_opt(sb, RESERVATION))
807 seq_puts(seq, ",noreservation");
808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 819 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
809 seq_printf(seq, ",commit=%u", 820 seq_printf(seq, ",commit=%u",
810 (unsigned) (sbi->s_commit_interval / HZ)); 821 (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
855 if (test_opt(sb, DATA_ERR_ABORT)) 866 if (test_opt(sb, DATA_ERR_ABORT))
856 seq_puts(seq, ",data_err=abort"); 867 seq_puts(seq, ",data_err=abort");
857 868
869 if (test_opt(sb, NO_AUTO_DA_ALLOC))
870 seq_puts(seq, ",noauto_da_alloc");
871
858 ext4_show_quota_options(seq, sb); 872 ext4_show_quota_options(seq, sb);
859 return 0; 873 return 0;
860} 874}
@@ -1004,7 +1018,7 @@ enum {
1004 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1018 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1005 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1019 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
1006 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1020 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1007 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1021 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
1008 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1022 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1009 Opt_journal_update, Opt_journal_dev, 1023 Opt_journal_update, Opt_journal_dev,
1010 Opt_journal_checksum, Opt_journal_async_commit, 1024 Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
1012 Opt_data_err_abort, Opt_data_err_ignore, 1026 Opt_data_err_abort, Opt_data_err_ignore,
1013 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1014 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1015 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1016 Opt_grpquota, Opt_i_version, 1030 Opt_usrquota, Opt_grpquota, Opt_i_version,
1017 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1018 Opt_inode_readahead_blks, Opt_journal_ioprio 1032 Opt_inode_readahead_blks, Opt_journal_ioprio
1019}; 1033};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
1039 {Opt_nouser_xattr, "nouser_xattr"}, 1053 {Opt_nouser_xattr, "nouser_xattr"},
1040 {Opt_acl, "acl"}, 1054 {Opt_acl, "acl"},
1041 {Opt_noacl, "noacl"}, 1055 {Opt_noacl, "noacl"},
1042 {Opt_reservation, "reservation"},
1043 {Opt_noreservation, "noreservation"},
1044 {Opt_noload, "noload"}, 1056 {Opt_noload, "noload"},
1045 {Opt_nobh, "nobh"}, 1057 {Opt_nobh, "nobh"},
1046 {Opt_bh, "bh"}, 1058 {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
1068 {Opt_quota, "quota"}, 1080 {Opt_quota, "quota"},
1069 {Opt_usrquota, "usrquota"}, 1081 {Opt_usrquota, "usrquota"},
1070 {Opt_barrier, "barrier=%u"}, 1082 {Opt_barrier, "barrier=%u"},
1083 {Opt_barrier, "barrier"},
1084 {Opt_nobarrier, "nobarrier"},
1071 {Opt_i_version, "i_version"}, 1085 {Opt_i_version, "i_version"},
1072 {Opt_stripe, "stripe=%u"}, 1086 {Opt_stripe, "stripe=%u"},
1073 {Opt_resize, "resize"}, 1087 {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
1075 {Opt_nodelalloc, "nodelalloc"}, 1089 {Opt_nodelalloc, "nodelalloc"},
1076 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1077 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1091 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1093 {Opt_auto_da_alloc, "auto_da_alloc"},
1094 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1078 {Opt_err, NULL}, 1095 {Opt_err, NULL},
1079}; 1096};
1080 1097
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1207 "not supported\n"); 1224 "not supported\n");
1208 break; 1225 break;
1209#endif 1226#endif
1210 case Opt_reservation:
1211 set_opt(sbi->s_mount_opt, RESERVATION);
1212 break;
1213 case Opt_noreservation:
1214 clear_opt(sbi->s_mount_opt, RESERVATION);
1215 break;
1216 case Opt_journal_update: 1227 case Opt_journal_update:
1217 /* @@@ FIXME */ 1228 /* @@@ FIXME */
1218 /* Eventually we will want to be able to create 1229 /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
1415 case Opt_abort: 1426 case Opt_abort:
1416 set_opt(sbi->s_mount_opt, ABORT); 1427 set_opt(sbi->s_mount_opt, ABORT);
1417 break; 1428 break;
1429 case Opt_nobarrier:
1430 clear_opt(sbi->s_mount_opt, BARRIER);
1431 break;
1418 case Opt_barrier: 1432 case Opt_barrier:
1419 if (match_int(&args[0], &option)) 1433 if (match_int(&args[0], &option)) {
1420 return 0; 1434 set_opt(sbi->s_mount_opt, BARRIER);
1435 break;
1436 }
1421 if (option) 1437 if (option)
1422 set_opt(sbi->s_mount_opt, BARRIER); 1438 set_opt(sbi->s_mount_opt, BARRIER);
1423 else 1439 else
@@ -1463,6 +1479,11 @@ set_qf_format:
1463 return 0; 1479 return 0;
1464 if (option < 0 || option > (1 << 30)) 1480 if (option < 0 || option > (1 << 30))
1465 return 0; 1481 return 0;
1482 if (option & (option - 1)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
1484 " must be a power of 2\n");
1485 return 0;
1486 }
1466 sbi->s_inode_readahead_blks = option; 1487 sbi->s_inode_readahead_blks = option;
1467 break; 1488 break;
1468 case Opt_journal_ioprio: 1489 case Opt_journal_ioprio:
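The new `option & (option - 1)` test above is the standard bit trick for rejecting non-power-of-two values: a power of two has exactly one bit set, so clearing its lowest set bit leaves zero. Note the kernel test lets 0 through, since 0 & (0 - 1) is also 0; the sketch below rejects it explicitly for strictness:

#include <stdbool.h>
#include <stdio.h>

/* a power of two has exactly one bit set, so v & (v - 1) clears it to 0 */
static bool is_power_of_two(unsigned int v)
{
	return v != 0 && (v & (v - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n", is_power_of_two(32),   /* 1 */
			     is_power_of_two(48),   /* 0 */
			     is_power_of_two(0));   /* 0 */
	return 0;
}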
@@ -1473,6 +1494,19 @@ set_qf_format:
1473 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 1494 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1474 option); 1495 option);
1475 break; 1496 break;
1497 case Opt_noauto_da_alloc:
1498 			set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1499 break;
1500 case Opt_auto_da_alloc:
1501 if (match_int(&args[0], &option)) {
1502 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1503 break;
1504 }
1505 if (option)
1506 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1507 else
1508 			set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1509 break;
1476 default: 1510 default:
1477 printk(KERN_ERR 1511 printk(KERN_ERR
1478 "EXT4-fs: Unrecognized mount option \"%s\" " 1512 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1612 gdp = ext4_get_group_desc(sb, i, &bh); 1646 gdp = ext4_get_group_desc(sb, i, &bh);
1613 1647
1614 flex_group = ext4_flex_group(sbi, i); 1648 flex_group = ext4_flex_group(sbi, i);
1615 sbi->s_flex_groups[flex_group].free_inodes += 1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
1616 ext4_free_inodes_count(sb, gdp); 1650 ext4_free_inodes_count(sb, gdp));
1617 sbi->s_flex_groups[flex_group].free_blocks += 1651 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
1618 ext4_free_blks_count(sb, gdp); 1652 ext4_free_blks_count(sb, gdp));
1653 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
1654 ext4_used_dirs_count(sb, gdp));
1619 } 1655 }
1620 1656
1621 return 1; 1657 return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1991 return 0; 2027 return 0;
1992} 2028}
1993 2029
2030/* sysfs support */
2031
2032struct ext4_attr {
2033 struct attribute attr;
2034 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2035 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2036 const char *, size_t);
2037 int offset;
2038};
2039
2040static int parse_strtoul(const char *buf,
2041 unsigned long max, unsigned long *value)
2042{
2043 char *endp;
2044
2045 while (*buf && isspace(*buf))
2046 buf++;
2047 *value = simple_strtoul(buf, &endp, 0);
2048 while (*endp && isspace(*endp))
2049 endp++;
2050 if (*endp || *value > max)
2051 return -EINVAL;
2052
2053 return 0;
2054}
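parse_strtoul() above trims surrounding whitespace, parses in whatever base simple_strtoul() detects (0 means auto-detect 0x/0 prefixes), and rejects trailing junk or values above max. A hedged userspace analogue using strtoul(), for illustration only:

#include <ctype.h>
#include <stdlib.h>

/* illustrative userspace analogue of parse_strtoul() */
static int parse_strtoul_user(const char *buf, unsigned long max,
			      unsigned long *value)
{
	char *endp;

	while (*buf && isspace((unsigned char)*buf))
		buf++;
	*value = strtoul(buf, &endp, 0);
	while (*endp && isspace((unsigned char)*endp))
		endp++;
	if (*endp || *value > max)
		return -1;	/* -EINVAL in the kernel version */
	return 0;
}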
2055
2056static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2057 struct ext4_sb_info *sbi,
2058 char *buf)
2059{
2060 return snprintf(buf, PAGE_SIZE, "%llu\n",
2061 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2062}
2063
2064static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2065 struct ext4_sb_info *sbi, char *buf)
2066{
2067 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2068
2069 return snprintf(buf, PAGE_SIZE, "%lu\n",
2070 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2071 sbi->s_sectors_written_start) >> 1);
2072}
2073
2074static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2075 struct ext4_sb_info *sbi, char *buf)
2076{
2077 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2078
2079 return snprintf(buf, PAGE_SIZE, "%llu\n",
2080 sbi->s_kbytes_written +
2081 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2082 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2083}
2084
2085static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2086 struct ext4_sb_info *sbi,
2087 const char *buf, size_t count)
2088{
2089 unsigned long t;
2090
2091 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL;
2093
2094 /* inode_readahead_blks must be a power of 2 */
2095 if (t & (t-1))
2096 return -EINVAL;
2097
2098 sbi->s_inode_readahead_blks = t;
2099 return count;
2100}
2101
2102static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf)
2104{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106
2107 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2108}
2109
2110static ssize_t sbi_ui_store(struct ext4_attr *a,
2111 struct ext4_sb_info *sbi,
2112 const char *buf, size_t count)
2113{
2114 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2115 unsigned long t;
2116
2117 if (parse_strtoul(buf, 0xffffffff, &t))
2118 return -EINVAL;
2119 *ui = t;
2120 return count;
2121}
2122
2123#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2124static struct ext4_attr ext4_attr_##_name = { \
2125 .attr = {.name = __stringify(_name), .mode = _mode }, \
2126 .show = _show, \
2127 .store = _store, \
2128 .offset = offsetof(struct ext4_sb_info, _elname), \
2129}
2130#define EXT4_ATTR(name, mode, show, store) \
2131static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2132
2133#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2134#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2135#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2136 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2137#define ATTR_LIST(name) &ext4_attr_##name.attr
2138
2139EXT4_RO_ATTR(delayed_allocation_blocks);
2140EXT4_RO_ATTR(session_write_kbytes);
2141EXT4_RO_ATTR(lifetime_write_kbytes);
2142EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2143 inode_readahead_blks_store, s_inode_readahead_blks);
2144EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2145EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2146EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2147EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2148EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2149EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2150
2151static struct attribute *ext4_attrs[] = {
2152 ATTR_LIST(delayed_allocation_blocks),
2153 ATTR_LIST(session_write_kbytes),
2154 ATTR_LIST(lifetime_write_kbytes),
2155 ATTR_LIST(inode_readahead_blks),
2156 ATTR_LIST(mb_stats),
2157 ATTR_LIST(mb_max_to_scan),
2158 ATTR_LIST(mb_min_to_scan),
2159 ATTR_LIST(mb_order2_req),
2160 ATTR_LIST(mb_stream_req),
2161 ATTR_LIST(mb_group_prealloc),
2162 NULL,
2163};
2164
2165static ssize_t ext4_attr_show(struct kobject *kobj,
2166 struct attribute *attr, char *buf)
2167{
2168 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2169 s_kobj);
2170 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2171
2172 return a->show ? a->show(a, sbi, buf) : 0;
2173}
2174
2175static ssize_t ext4_attr_store(struct kobject *kobj,
2176 struct attribute *attr,
2177 const char *buf, size_t len)
2178{
2179 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2180 s_kobj);
2181 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2182
2183 return a->store ? a->store(a, sbi, buf, len) : 0;
2184}
2185
2186static void ext4_sb_release(struct kobject *kobj)
2187{
2188 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2189 s_kobj);
2190 complete(&sbi->s_kobj_unregister);
2191}
2192
2193
2194static struct sysfs_ops ext4_attr_ops = {
2195 .show = ext4_attr_show,
2196 .store = ext4_attr_store,
2197};
2198
2199static struct kobj_type ext4_ktype = {
2200 .default_attrs = ext4_attrs,
2201 .sysfs_ops = &ext4_attr_ops,
2202 .release = ext4_sb_release,
2203};
2204
1994static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2205static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1995 __releases(kernel_lock) 2206 __releases(kernel_lock)
1996 __acquires(kernel_lock) 2207 __acquires(kernel_lock)
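The block above wires per-filesystem tunables into sysfs: each file under /sys/fs/ext4/<dev>/ is an ext4_attr, and the generic ext4_attr_show/store handlers recover the owning ext4_sb_info with container_of(), because s_kobj is embedded inside it. A stripped-down, compilable sketch of the same embed-and-recover idiom (names are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj    { const char *name; };
struct sb_info { unsigned int readahead; struct kobj kobj; };

static void show(struct kobj *k)
{
	/* recover the enclosing sb_info from the embedded kobject */
	struct sb_info *sbi = container_of(k, struct sb_info, kobj);
	printf("%s: %u\n", k->name, sbi->readahead);
}

int main(void)
{
	struct sb_info sbi = { .readahead = 32, .kobj = { "sda1" } };
	show(&sbi.kobj);
	return 0;
}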
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2232 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2022 if (!sbi) 2233 if (!sbi)
2023 return -ENOMEM; 2234 return -ENOMEM;
2235
2236 sbi->s_blockgroup_lock =
2237 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2238 if (!sbi->s_blockgroup_lock) {
2239 kfree(sbi);
2240 return -ENOMEM;
2241 }
2024 sb->s_fs_info = sbi; 2242 sb->s_fs_info = sbi;
2025 sbi->s_mount_opt = 0; 2243 sbi->s_mount_opt = 0;
2026 sbi->s_resuid = EXT4_DEF_RESUID; 2244 sbi->s_resuid = EXT4_DEF_RESUID;
2027 sbi->s_resgid = EXT4_DEF_RESGID; 2245 sbi->s_resgid = EXT4_DEF_RESGID;
2028 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2246 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2029 sbi->s_sb_block = sb_block; 2247 sbi->s_sb_block = sb_block;
2248 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
2249 sectors[1]);
2030 2250
2031 unlock_kernel(); 2251 unlock_kernel();
2032 2252
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2064 sb->s_magic = le16_to_cpu(es->s_magic); 2284 sb->s_magic = le16_to_cpu(es->s_magic);
2065 if (sb->s_magic != EXT4_SUPER_MAGIC) 2285 if (sb->s_magic != EXT4_SUPER_MAGIC)
2066 goto cantfind_ext4; 2286 goto cantfind_ext4;
2287 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
2067 2288
2068 /* Set defaults before we parse the mount options */ 2289 /* Set defaults before we parse the mount options */
2069 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2290 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2101 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2102 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2103 2324
2104 set_opt(sbi->s_mount_opt, RESERVATION);
2105 set_opt(sbi->s_mount_opt, BARRIER); 2325 set_opt(sbi->s_mount_opt, BARRIER);
2106 2326
2107 /* 2327 /*
@@ -2288,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2288 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2508 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2289 goto cantfind_ext4; 2509 goto cantfind_ext4;
2290 2510
2511 /* check blocks count against device size */
2512 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2513 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
2514 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
2515 "exceeds size of device (%llu blocks)\n",
2516 ext4_blocks_count(es), blocks_count);
2517 goto failed_mount;
2518 }
2519
2291 /* 2520 /*
2292 * It makes no sense for the first data block to be beyond the end 2521 * It makes no sense for the first data block to be beyond the end
2293 * of the filesystem. 2522 * of the filesystem.
@@ -2325,14 +2554,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2325#ifdef CONFIG_PROC_FS 2554#ifdef CONFIG_PROC_FS
2326 if (ext4_proc_root) 2555 if (ext4_proc_root)
2327 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2556 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2328
2329 if (sbi->s_proc)
2330 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2331 &ext4_ui_proc_fops,
2332 &sbi->s_inode_readahead_blks);
2333#endif 2557#endif
2334 2558
2335 bgl_lock_init(&sbi->s_blockgroup_lock); 2559 bgl_lock_init(sbi->s_blockgroup_lock);
2336 2560
2337 for (i = 0; i < db_count; i++) { 2561 for (i = 0; i < db_count; i++) {
2338 block = descriptor_loc(sb, logical_sb_block, i); 2562 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2788,16 @@ no_journal:
2564 goto failed_mount4; 2788 goto failed_mount4;
2565 } 2789 }
2566 2790
2791 sbi->s_kobj.kset = ext4_kset;
2792 init_completion(&sbi->s_kobj_unregister);
2793 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
2794 "%s", sb->s_id);
2795 if (err) {
2796 ext4_mb_release(sb);
2797 ext4_ext_release(sb);
2798 goto failed_mount4;
2799 	}
2800
2567 /* 2801 /*
2568 * akpm: core read_super() calls in here with the superblock locked. 2802 * akpm: core read_super() calls in here with the superblock locked.
2569 * That deadlocks, because orphan cleanup needs to lock the superblock 2803 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2852,6 @@ failed_mount2:
2618 kfree(sbi->s_group_desc); 2852 kfree(sbi->s_group_desc);
2619failed_mount: 2853failed_mount:
2620 if (sbi->s_proc) { 2854 if (sbi->s_proc) {
2621 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2622 remove_proc_entry(sb->s_id, ext4_proc_root); 2855 remove_proc_entry(sb->s_id, ext4_proc_root);
2623 } 2856 }
2624#ifdef CONFIG_QUOTA 2857#ifdef CONFIG_QUOTA
@@ -2913,6 +3146,10 @@ static int ext4_commit_super(struct super_block *sb,
2913 set_buffer_uptodate(sbh); 3146 set_buffer_uptodate(sbh);
2914 } 3147 }
2915 es->s_wtime = cpu_to_le32(get_seconds()); 3148 es->s_wtime = cpu_to_le32(get_seconds());
3149 es->s_kbytes_written =
3150 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3151 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3152 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2916 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3153 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeblocks_counter)); 3154 &EXT4_SB(sb)->s_freeblocks_counter));
2918 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3155 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
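The sectors[1] counter holds 512-byte sectors written to the device, so shifting the since-mount delta right by one converts it to KiB before adding it to the s_kbytes_written value carried over from the on-disk superblock. The arithmetic, as a sketch:

/* lifetime KiB written = value stored in the superblock at mount time
 * plus half the 512-byte sectors written since mount */
static unsigned long long lifetime_kbytes(unsigned long long stored_kb,
					  unsigned long long sectors_now,
					  unsigned long long sectors_at_mount)
{
	return stored_kb + ((sectors_now - sectors_at_mount) >> 1);
}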
@@ -3647,45 +3884,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3647 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3884 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3648} 3885}
3649 3886
3650#ifdef CONFIG_PROC_FS
3651static int ext4_ui_proc_show(struct seq_file *m, void *v)
3652{
3653 unsigned int *p = m->private;
3654
3655 seq_printf(m, "%u\n", *p);
3656 return 0;
3657}
3658
3659static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3660{
3661 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3662}
3663
3664static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3665 size_t cnt, loff_t *ppos)
3666{
3667 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3668 char str[32];
3669
3670 if (cnt >= sizeof(str))
3671 return -EINVAL;
3672 if (copy_from_user(str, buf, cnt))
3673 return -EFAULT;
3674
3675 *p = simple_strtoul(str, NULL, 0);
3676 return cnt;
3677}
3678
3679const struct file_operations ext4_ui_proc_fops = {
3680 .owner = THIS_MODULE,
3681 .open = ext4_ui_proc_open,
3682 .read = seq_read,
3683 .llseek = seq_lseek,
3684 .release = single_release,
3685 .write = ext4_ui_proc_write,
3686};
3687#endif
3688
3689static struct file_system_type ext4_fs_type = { 3887static struct file_system_type ext4_fs_type = {
3690 .owner = THIS_MODULE, 3888 .owner = THIS_MODULE,
3691 .name = "ext4", 3889 .name = "ext4",
@@ -3719,6 +3917,9 @@ static int __init init_ext4_fs(void)
3719{ 3917{
3720 int err; 3918 int err;
3721 3919
3920 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3921 if (!ext4_kset)
3922 return -ENOMEM;
3722 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3923 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3723 err = init_ext4_mballoc(); 3924 err = init_ext4_mballoc();
3724 if (err) 3925 if (err)
@@ -3760,6 +3961,7 @@ static void __exit exit_ext4_fs(void)
3760 exit_ext4_xattr(); 3961 exit_ext4_xattr();
3761 exit_ext4_mballoc(); 3962 exit_ext4_mballoc();
3762 remove_proc_entry("fs/ext4", NULL); 3963 remove_proc_entry("fs/ext4", NULL);
3964 kset_unregister(ext4_kset);
3763} 3965}
3764 3966
3765MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3967MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index d0a69ff25375..182f9ffe2b51 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET
95 Note that "utf8" is not recommended for FAT filesystems. 95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here. 96 If unsure, you shouldn't set "utf8" here.
97 See <file:Documentation/filesystems/vfat.txt> for more information. 97 See <file:Documentation/filesystems/vfat.txt> for more information.
98
99 Enable any character sets you need in File Systems/Native Language
100 Support.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e00..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
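huge_encode_dev() folds the backing device's major/minor numbers into a single 64-bit value, which statfs then exposes by splitting it across the two 32-bit words of f_fsid. The split itself, as a sketch (the sample id is made up):

#include <stdio.h>

int main(void)
{
	unsigned long long id = 0x0000000800000003ULL; /* illustrative */
	unsigned int val0 = (unsigned int)id;          /* low 32 bits  */
	unsigned int val1 = (unsigned int)(id >> 32);  /* high 32 bits */

	printf("f_fsid = { %#x, %#x }\n", val0, val1);
	return 0;
}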
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cc8e4de2fee5..1ad703150dee 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -117,11 +117,13 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
117{ 117{
118 if (unlikely(newfd == oldfd)) { /* corner case */ 118 if (unlikely(newfd == oldfd)) { /* corner case */
119 struct files_struct *files = current->files; 119 struct files_struct *files = current->files;
120 int retval = oldfd;
121
120 rcu_read_lock(); 122 rcu_read_lock();
121 if (!fcheck_files(files, oldfd)) 123 if (!fcheck_files(files, oldfd))
122 oldfd = -EBADF; 124 retval = -EBADF;
123 rcu_read_unlock(); 125 rcu_read_unlock();
124 return oldfd; 126 return retval;
125 } 127 }
126 return sys_dup3(oldfd, newfd, 0); 128 return sys_dup3(oldfd, newfd, 0);
127} 129}
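The new signed retval local matters because oldfd is an unsigned int: storing -EBADF in it and returning it through the syscall's return value yields a large positive number on 64-bit rather than a negative errno. A userspace sketch of the truncation this presumably guards against:

#include <stdio.h>

static long via_unsigned(void)
{
	unsigned int fd = -9;	/* -EBADF forced into an unsigned int */
	return fd;		/* widens to 4294967287, not -9 */
}

static long via_signed(void)
{
	int retval = -9;	/* a signed local keeps the sign */
	return retval;		/* widens to -9 as intended */
}

int main(void)
{
	printf("%ld %ld\n", via_unsigned(), via_signed());
	return 0;
}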
diff --git a/fs/file_table.c b/fs/file_table.c
index b74a8e1da913..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
169 fmode_t mode, const struct file_operations *fop) 169 fmode_t mode, const struct file_operations *fop)
170{ 170{
171 struct file *file; 171 struct file *file;
172 struct path;
173 172
174 file = get_empty_filp(); 173 file = get_empty_filp();
175 if (!file) 174 if (!file)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa70260e6d1..a24c58e181db 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
199 return retval; 199 return retval;
200} 200}
201 201
202int get_filesystem_list(char * buf) 202int __init get_filesystem_list(char *buf)
203{ 203{
204 int len = 0; 204 int len = 0;
205 struct file_system_type * tmp; 205 struct file_system_type * tmp;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faaf..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
196 struct inode *tail_inode; 196 struct inode *tail_inode;
197 197
198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 198 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
199 if (!time_after_eq(inode->dirtied_when, 199 if (time_before(inode->dirtied_when,
200 tail_inode->dirtied_when)) 200 tail_inode->dirtied_when))
201 inode->dirtied_when = jiffies; 201 inode->dirtied_when = jiffies;
202 } 202 }
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
220 wake_up_bit(&inode->i_state, __I_SYNC); 220 wake_up_bit(&inode->i_state, __I_SYNC);
221} 221}
222 222
223static bool inode_dirtied_after(struct inode *inode, unsigned long t)
224{
225 bool ret = time_after(inode->dirtied_when, t);
226#ifndef CONFIG_64BIT
227 /*
228 * For inodes being constantly redirtied, dirtied_when can get stuck.
229 * It _appears_ to be in the future, but is actually in distant past.
230 * This test is necessary to prevent such wrapped-around relative times
231 * from permanently stopping the whole pdflush writeback.
232 */
233 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
234#endif
235 return ret;
236}
237
223/* 238/*
224 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 239 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
225 */ 240 */
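inode_dirtied_after() guards against 32-bit jiffies wrap: time_after() only answers correctly while the two stamps are less than half the counter range apart, so a dirtied_when left untouched for about 2^31 ticks appears to be in the future indefinitely. A small sketch of the failure on unsigned 32-bit counters (mirroring the kernel's time_after macro):

#include <stdint.h>
#include <stdio.h>

/* mirrors the kernel's time_after(): the signed difference is immune
 * to wrap only while the two stamps are less than 2^31 ticks apart */
static int time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t now = 100;
	uint32_t ancient = now + 0x80000000u; /* dirtied ~2^31 ticks ago */

	/* the ancient stamp wrongly reads as being in the future, which
	 * is the stuck case the extra time_before_eq() clamp catches */
	printf("%d\n", time_after32(ancient, now)); /* prints 1 */
	return 0;
}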
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
231 struct inode *inode = list_entry(delaying_queue->prev, 246 struct inode *inode = list_entry(delaying_queue->prev,
232 struct inode, i_list); 247 struct inode, i_list);
233 if (older_than_this && 248 if (older_than_this &&
234 time_after(inode->dirtied_when, *older_than_this)) 249 inode_dirtied_after(inode, *older_than_this))
235 break; 250 break;
236 list_move(&inode->i_list, dispatch_queue); 251 list_move(&inode->i_list, dispatch_queue);
237 } 252 }
@@ -420,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
420 * If older_than_this is non-NULL, then only write out inodes which 435 * If older_than_this is non-NULL, then only write out inodes which
421 * had their first dirtying at a time earlier than *older_than_this. 436 * had their first dirtying at a time earlier than *older_than_this.
422 * 437 *
423 * If we're a pdlfush thread, then implement pdflush collision avoidance 438 * If we're a pdflush thread, then implement pdflush collision avoidance
424 * against the entire list. 439 * against the entire list.
425 * 440 *
426 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 441 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
492 continue; /* blockdev has wrong queue */ 507 continue; /* blockdev has wrong queue */
493 } 508 }
494 509
495 /* Was this inode dirtied after sync_sb_inodes was called? */ 510 /*
496 if (time_after(inode->dirtied_when, start)) 511 * Was this inode dirtied after sync_sb_inodes was called?
512 * This keeps sync from extra jobs and livelock.
513 */
514 if (inode_dirtied_after(inode, start))
497 break; 515 break;
498 516
499 /* Is another pdflush already flushing this queue? */ 517 /* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 556 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
539 struct address_space *mapping; 557 struct address_space *mapping;
540 558
541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 559 if (inode->i_state &
560 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
542 continue; 561 continue;
543 mapping = inode->i_mapping; 562 mapping = inode->i_mapping;
544 if (mapping->nrpages == 0) 563 if (mapping->nrpages == 0)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103 /* We don't need to lock fs - think why ;-) */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
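exit_fs(), unshare_fs_struct() and daemonize_fs_struct() above all share one teardown idiom: decrement fs->users under the lock, record in a local `kill` whether it reached zero, drop the lock, and only then free, so free_fs_struct() (which can block in path_put()) never runs with the spinning write lock held. A pthread sketch of that shape (illustrative types):

#include <pthread.h>
#include <stdlib.h>

struct shared {
	pthread_rwlock_t lock;
	int users;
};

/* drop one reference; the free happens outside the lock, as in exit_fs() */
static void put_shared(struct shared *s)
{
	int kill;

	pthread_rwlock_wrlock(&s->lock);
	kill = (--s->users == 0);
	pthread_rwlock_unlock(&s->lock);

	if (kill) {
		pthread_rwlock_destroy(&s->lock);
		free(s);
	}
}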
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
24 execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
54 enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
47 /* return a dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
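__fscache_lookup_cache_tag() above uses the classic optimistic lookup-then-insert pattern: search under the read lock, allocate a candidate with no lock held, then re-search under the write lock before inserting, discarding the candidate if it lost the race. A compact pthread sketch of the same shape (illustrative names, error handling trimmed; note the kernel bumps usage with atomic_inc(), since a shared read lock alone does not serialize the counter):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct tag { struct tag *next; int usage; char name[32]; };

static struct tag *tags;
static pthread_rwlock_t tags_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct tag *find(const char *name)
{
	struct tag *t;
	for (t = tags; t; t = t->next)
		if (!strcmp(t->name, name))
			return t;
	return NULL;
}

struct tag *lookup_tag(const char *name)
{
	struct tag *t, *cand;

	pthread_rwlock_rdlock(&tags_lock);	/* fast path: read lock */
	t = find(name);
	if (t)
		t->usage++;	/* would be atomic_inc() in the kernel */
	pthread_rwlock_unlock(&tags_lock);
	if (t)
		return t;

	cand = calloc(1, sizeof(*cand));	/* allocate unlocked */
	if (!cand)
		return NULL;
	cand->usage = 1;
	strncpy(cand->name, name, sizeof(cand->name) - 1);

	pthread_rwlock_wrlock(&tags_lock);	/* recheck under write lock */
	t = find(name);
	if (t) {
		t->usage++;			/* lost the race: reuse */
	} else {
		cand->next = tags;
		tags = cand;
		t = cand;
		cand = NULL;
	}
	pthread_rwlock_unlock(&tags_lock);
	free(cand);				/* discard the loser, if any */
	return t;
}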
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs's to use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290	_leave(" = -EEXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154	/* now we need to see whether the backing objects for this cookie
155	 * exist yet; if not, there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
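/*
 * Illustrative sketch (not part of this file): a netfs calls the
 * fscache_update_cookie() wrapper after changing whatever its get_aux()
 * function reports, so that each backing cache rewrites its index entry.
 * The "vnode" naming below is hypothetical:
 *
 *	vnode->status.mtime = new_mtime;
 *	fscache_update_cookie(vnode->cache);
 */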
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
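/*
 * Illustrative sketch (not part of this file): a netfs normally calls the
 * fscache_relinquish_cookie() wrapper when it evicts the corresponding
 * inode, passing a non-zero retire argument if the cached data should be
 * discarded as well.  The "vnode" naming below is hypothetical:
 *
 *	fscache_relinquish_cookie(vnode->cache, 0);
 *	vnode->cache = NULL;
 */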
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
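/*
 * Illustrative sketch (not part of this file): a netfs creates its entry
 * under FSDEF by registering a struct fscache_netfs through the
 * fscache_register_netfs() wrapper; the name and version used here are
 * hypothetical:
 *
 *	static struct fscache_netfs example_netfs = {
 *		.name		= "examplefs",
 *		.version	= 1,
 *	};
 *
 *	ret = fscache_register_netfs(&example_netfs);
 *	...
 *	fscache_unregister_netfs(&example_netfs);
 */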
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
99 * structure version number of the netfs for which this entry is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64	if ((unsigned long long)*_pos > HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
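/*
 * A sketch of the rendered /proc/fs/fscache/histogram output (the counts
 * are illustrative and HZ is assumed to be 250):
 *
 *	JIFS  SECS  OBJ INST  OP RUNS  OBJ RUNS  RETRV DLY RETRIEVLS
 *	===== ===== ========= ========= ========= ========= =========
 *	   1  0.004         3        17         9         0         5
 *	   2  0.008         1         6         4         1         2
 */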
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..1c341304621f
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
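/*
 * For example (sketch only), code that needs both a cookie's lock and its
 * parent's lock takes them in the order above, using the _nested variant
 * for the inner lock so that lockdep does not report a false recursion:
 *
 *	spin_lock(&cookie->lock);
 *	spin_lock_nested(&cookie->parent->lock, 1);
 *	...
 *	spin_unlock(&cookie->parent->lock);
 *	spin_unlock(&cookie->lock);
 */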
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
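/*
 * Typical use of fscache_hist() (this exact pattern appears in object.c):
 *
 *	start = jiffies;
 *	fscache_object_state_machine(object);
 *	fscache_hist(fscache_objs_histogram, start);
 */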
76
77/*
78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
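/*
 * Worked example: to log the entry points of the COOKIE-level debugging
 * statements, set bit (FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_COOKIE * 3))
 * in the fscache_debug mask, i.e. 1 << 3 = 8; enabling all three point
 * types for the PAGE level would add 7 << 6 = 0x1c0.
 */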
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
27MODULE_PARM_DESC(defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
33MODULE_PARM_DESC(defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
39MODULE_PARM_DESC(debug,
40 "FS-Cache debugging mask");
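/*
 * Sketch of runtime use (the values written are illustrative): as the
 * parameters above are declared S_IWUSR | S_IRUGO, root can adjust them
 * through sysfs, e.g.:
 *
 *	echo 8 >/sys/module/fscache/parameters/debug
 *	echo 0 >/sys/module/fscache/parameters/defer_lookup
 */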
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70
71 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
72 if (!fscache_root) {
	ret = -ENOMEM;	/* 'ret' would otherwise still be 0 here */
73 goto error_kobj;
	}
74
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
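/*
 * Both functions are handed to wait_on_bit() as its action routine; for
 * example, this pattern appears in cookie.c:
 *
 *	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
 *		    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
 */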
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
	/* drop the refs taken on the parent index above */
	atomic_dec(&netfs->primary_index->parent->n_children);
	atomic_dec(&netfs->primary_index->parent->usage);
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
55 * we need to notify the parent when an op that we had outstanding on it
56 * completes
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
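/*
 * In sketch form, the usual lifecycle driven by the handlers below is:
 *
 *	INIT -> LOOKING_UP -> [CREATING ->] AVAILABLE -> ACTIVE <-> UPDATING
 *
 * where negative lookups pass through CREATING and positive lookups go
 * straight to AVAILABLE.  Any of the terminal events (release, retire,
 * withdraw, error) diverts into DYING (or LC_DYING during lookup) and
 * thence into RELEASING, RECYCLING or WITHDRAWING before the object ends
 * up DEAD.
 */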
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
110 * operations and queue dependent operations for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377		 * add ourselves to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
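/*
 * Illustrative sketch (the example_* function names are hypothetical) of
 * how a cache backend's ->lookup_object() op is expected to use the two
 * helpers above:
 *
 *	static void example_lookup_object(struct fscache_object *object)
 *	{
 *		if (!example_find_on_disk(object)) {
 *			fscache_object_lookup_negative(object);
 *			example_create_on_disk(object);
 *		}
 *		fscache_obtained_object(object);
 *	}
 */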
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
658 * - need to break the links to a cached object cookie
659 * - called under two situations:
660 * (1) recycler decides to reclaim an in-use object
661 * (2) a cache is unmounted
662 * - have to take care as the netfs may be relinquishing the cookie at the
663 *   same time
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
705	object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the caller must hold the object's lock
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
774 * This function consults the netfs about the coherency state of an object
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
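/*
 * Illustrative sketch: a cache backend revalidates an on-disk object by
 * handing back the auxiliary data it previously stored (in an xattr,
 * say); the example_* helpers and the buffer are hypothetical:
 *
 *	switch (fscache_check_aux(object, auxbuf, auxlen)) {
 *	case FSCACHE_CHECKAUX_OKAY:
 *		break;					use the object as-is
 *	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
 *		example_update_xattr(object);		refresh stored aux
 *		break;
 *	case FSCACHE_CHECKAUX_OBSOLETE:
 *		example_delete_object(object);		discard and recreate
 *		break;
 *	}
 */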
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
54			printk(KERN_ERR "FS-Cache: Unexpected op type %lx\n",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
100		if (object->n_ops > 1) {
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
164			kdebug("%p %p", p->processor, p->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - ops may be submitted only while the object is in one of the following states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
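/*
 * Illustrative sketch (not part of this patch): together, the two helpers
 * above let a netfs gate page release on cache writes still in flight. The
 * my_* names are hypothetical, and the fscache_check_page_write(),
 * fscache_wait_on_page_write() and fscache_uncache_page() wrappers are
 * assumed to come from the public header.
 */
static int my_releasepage(struct page *page, gfp_t gfp)
{
	struct fscache_cookie *cookie = my_page_cookie(page);	/* hypothetical */

	if (PageFsCache(page)) {
		if (fscache_check_page_write(cookie, page)) {
			if (!(gfp & __GFP_WAIT))
				return 0;	/* caller may not sleep */
			fscache_wait_on_page_write(cookie, page);
		}
		fscache_uncache_page(cookie, page);
	}
	return 1;				/* page may be released */
}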
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
172 /* allocate a retrieval operation and attempt to submit it */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
232int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
233 struct page *page,
234 fscache_rw_complete_t end_io_func,
235 void *context,
236 gfp_t gfp)
237{
238 struct fscache_retrieval *op;
239 struct fscache_object *object;
240 int ret;
241
242 _enter("%p,%p,,,", cookie, page);
243
244 fscache_stat(&fscache_n_retrievals);
245
246 if (hlist_empty(&cookie->backing_objects))
247 goto nobufs;
248
249 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
250 ASSERTCMP(page, !=, NULL);
251
252 if (fscache_wait_for_deferred_lookup(cookie) < 0)
253 return -ERESTARTSYS;
254
255 op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
256 if (!op) {
257 _leave(" = -ENOMEM");
258 return -ENOMEM;
259 }
260
261 spin_lock(&cookie->lock);
262
263 if (hlist_empty(&cookie->backing_objects))
264 goto nobufs_unlock;
265 object = hlist_entry(cookie->backing_objects.first,
266 struct fscache_object, cookie_link);
267
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269
270 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock;
272 spin_unlock(&cookie->lock);
273
274 fscache_stat(&fscache_n_retrieval_ops);
275
276 /* pin the netfs read context in case we need to do the actual netfs
277 * read because we've encountered a cache read failure */
278 fscache_get_context(object->cookie, op->context);
279
280 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
283 _debug(">>> WT");
284 fscache_stat(&fscache_n_retrieval_op_waits);
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
287 _debug("<<< GO");
288 }
289
290 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
292 ret = object->cache->ops->allocate_page(op, page, gfp);
293 if (ret == 0)
294 ret = -ENODATA;
295 } else {
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
297 }
298
299 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS)
302 fscache_stat(&fscache_n_retrievals_intr);
303 else if (ret == -ENODATA)
304 fscache_stat(&fscache_n_retrievals_nodata);
305 else if (ret < 0)
306 fscache_stat(&fscache_n_retrievals_nobufs);
307 else
308 fscache_stat(&fscache_n_retrievals_ok);
309
310 fscache_put_retrieval(op);
311 _leave(" = %d", ret);
312 return ret;
313
314nobufs_unlock:
315 spin_unlock(&cookie->lock);
316 kfree(op);
317nobufs:
318 fscache_stat(&fscache_n_retrievals_nobufs);
319 _leave(" = -ENOBUFS");
320 return -ENOBUFS;
321}
322EXPORT_SYMBOL(__fscache_read_or_alloc_page);
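/*
 * Illustrative sketch (not part of this patch): the return-value contract
 * above maps directly onto a netfs readpage implementation. 0 means the
 * cache took the read and end_io_func() will complete the page; -ENODATA
 * and -ENOBUFS mean the netfs must fetch the data itself. The my_* names
 * are hypothetical.
 */
static int my_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = my_file_cookie(file);	/* hypothetical */
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page,
					 my_read_complete,	/* end_io_func */
					 NULL,			/* context */
					 GFP_KERNEL);
	switch (ret) {
	case 0:			/* read dispatched to the cache */
		return 0;
	case -ENOBUFS:		/* no backing object available */
	case -ENODATA:		/* block present but no data yet */
		return my_read_from_server(file, page);
	default:
		return ret;	/* -ENOMEM or -ERESTARTSYS */
	}
}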
323
324/*
325 * read a list of pages from the cache or allocate blocks in which to store
326 * them
327 * - we return:
328 * -ENOMEM - out of memory, some pages may be being read
329 * -ERESTARTSYS - interrupted, some pages may be being read
330 * -ENOBUFS - no backing object or space available in which to cache any
331 * pages not being read
332 * -ENODATA - no data available in the backing object for some or all of
333 * the pages
334 * 0 - dispatched a read on all pages
335 *
336 * end_io_func() will be called for each page read from the cache as it
337 * finishes being read
338 *
339 * any pages for which a read is dispatched will be removed from *pages and
340 * deducted from *nr_pages
341 */
342int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned *nr_pages,
346 fscache_rw_complete_t end_io_func,
347 void *context,
348 gfp_t gfp)
349{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op;
352 struct fscache_object *object;
353 int ret;
354
355 _enter("%p,,%d,,,", cookie, *nr_pages);
356
357 fscache_stat(&fscache_n_retrievals);
358
359 if (hlist_empty(&cookie->backing_objects))
360 goto nobufs;
361
362 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
363 ASSERTCMP(*nr_pages, >, 0);
364 ASSERT(!list_empty(pages));
365
366 if (fscache_wait_for_deferred_lookup(cookie) < 0)
367 return -ERESTARTSYS;
368
369 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op)
371 return -ENOMEM;
372
373 spin_lock(&cookie->lock);
374
375 if (hlist_empty(&cookie->backing_objects))
376 goto nobufs_unlock;
377 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link);
379
380 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock;
382 spin_unlock(&cookie->lock);
383
384 fscache_stat(&fscache_n_retrieval_ops);
385
386 /* pin the netfs read context in case we need to do the actual netfs
387 * read because we've encountered a cache read failure */
388 fscache_get_context(object->cookie, op->context);
389
390 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
393 _debug(">>> WT");
394 fscache_stat(&fscache_n_retrieval_op_waits);
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
397 _debug("<<< GO");
398 }
399
400 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
402 func = object->cache->ops->allocate_pages;
403 else
404 func = object->cache->ops->read_or_alloc_pages;
405 ret = func(op, pages, nr_pages, gfp);
406
407 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS)
410 fscache_stat(&fscache_n_retrievals_intr);
411 else if (ret == -ENODATA)
412 fscache_stat(&fscache_n_retrievals_nodata);
413 else if (ret < 0)
414 fscache_stat(&fscache_n_retrievals_nobufs);
415 else
416 fscache_stat(&fscache_n_retrievals_ok);
417
418 fscache_put_retrieval(op);
419 _leave(" = %d", ret);
420 return ret;
421
422nobufs_unlock:
423 spin_unlock(&cookie->lock);
424 kfree(op);
425nobufs:
426 fscache_stat(&fscache_n_retrievals_nobufs);
427 _leave(" = -ENOBUFS");
428 return -ENOBUFS;
429}
430EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
431
432/*
433 * allocate a block in the cache on which to store a page
434 * - we return:
435 * -ENOMEM - out of memory, nothing done
436 * -ERESTARTSYS - interrupted
437 * -ENOBUFS - no backing object available in which to cache the block
438 * 0 - block allocated
439 */
440int __fscache_alloc_page(struct fscache_cookie *cookie,
441 struct page *page,
442 gfp_t gfp)
443{
444 struct fscache_retrieval *op;
445 struct fscache_object *object;
446 int ret;
447
448 _enter("%p,%p,,,", cookie, page);
449
450 fscache_stat(&fscache_n_allocs);
451
452 if (hlist_empty(&cookie->backing_objects))
453 goto nobufs;
454
455 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
456 ASSERTCMP(page, !=, NULL);
457
458 if (fscache_wait_for_deferred_lookup(cookie) < 0)
459 return -ERESTARTSYS;
460
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op)
463 return -ENOMEM;
464
465 spin_lock(&cookie->lock);
466
467 if (hlist_empty(&cookie->backing_objects))
468 goto nobufs_unlock;
469 object = hlist_entry(cookie->backing_objects.first,
470 struct fscache_object, cookie_link);
471
472 if (fscache_submit_op(object, &op->op) < 0)
473 goto nobufs_unlock;
474 spin_unlock(&cookie->lock);
475
476 fscache_stat(&fscache_n_alloc_ops);
477
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
479 _debug(">>> WT");
480 fscache_stat(&fscache_n_alloc_op_waits);
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
483 _debug("<<< GO");
484 }
485
486 /* ask the cache to honour the operation */
487 ret = object->cache->ops->allocate_page(op, page, gfp);
488
489 if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs);
491 else
492 fscache_stat(&fscache_n_allocs_ok);
493
494 fscache_put_retrieval(op);
495 _leave(" = %d", ret);
496 return ret;
497
498nobufs_unlock:
499 spin_unlock(&cookie->lock);
500 kfree(op);
501nobufs:
502 fscache_stat(&fscache_n_allocs_nobufs);
503 _leave(" = -ENOBUFS");
504 return -ENOBUFS;
505}
506EXPORT_SYMBOL(__fscache_alloc_page);
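/*
 * Illustrative sketch (not part of this patch): fscache_alloc_page() is the
 * reserve-only variant of the retrieval interface, useful when a page is
 * about to be rewritten in full and reading the old cache contents would be
 * wasted work:
 *
 *	ret = fscache_alloc_page(cookie, page, GFP_KERNEL);
 *	if (ret < 0 && ret != -ENOBUFS)
 *		return ret;		-ENOMEM or -ERESTARTSYS
 *	-ENOBUFS just means this page won't be cached this time
 */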
507
508/*
509 * release a write op reference
510 */
511static void fscache_release_write_op(struct fscache_operation *_op)
512{
513 _enter("{OP%x}", _op->debug_id);
514}
515
516/*
517 * perform the background storage of a page into the cache
518 */
519static void fscache_write_op(struct fscache_operation *_op)
520{
521 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie;
525 struct page *page;
526 unsigned n;
527 void *results[1];
528 int ret;
529
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531
532 spin_lock(&cookie->lock);
533 spin_lock(&object->lock);
534
535 if (!fscache_object_is_active(object)) {
536 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave("");
539 return;
540 }
541
542 fscache_stat(&fscache_n_store_calls);
543
544 /* find a page to store */
545 page = NULL;
546 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
547 FSCACHE_COOKIE_PENDING_TAG);
548 if (n != 1)
549 goto superseded;
550 page = results[0];
551 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit)
553 goto superseded;
554
555 radix_tree_tag_clear(&cookie->stores, page->index,
556 FSCACHE_COOKIE_PENDING_TAG);
557
558 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560
561 if (page) {
562 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page);
564 page_cache_release(page);
565 if (ret < 0)
566 fscache_abort_object(object);
567 else
568 fscache_enqueue_operation(&op->op);
569 }
570
571 _leave("");
572 return;
573
574superseded:
575 /* this writer is going away and there aren't any more things to
576 * write */
577 _debug("cease");
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave("");
582}
583
584/*
585 * request a page be stored in the cache
586 * - returns:
587 * -ENOMEM - out of memory, nothing done
588 * -ENOBUFS - no backing object available in which to cache the page
589 * 0 - dispatched a write - it'll call end_io_func() when finished
590 *
591 * if the cookie still has a backing object at this point, that object can be
592 * in one of a few states with respect to storage processing:
593 *
594 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
595 * set)
596 *
597 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
598 * fill op)
599 *
600 * (b) writes deferred till post-creation (mark page for writing and
601 * return immediately)
602 *
603 * (2) negative lookup, object created, initial fill being made from netfs
604 * (FSCACHE_COOKIE_INITIAL_FILL is set)
605 *
606 * (a) fill point not yet reached this page (mark page for writing and
607 * return)
608 *
609 * (b) fill point passed this page (queue op to store this page)
610 *
611 * (3) object extant (queue op to store this page)
612 *
613 * any other state is invalid
614 */
615int __fscache_write_page(struct fscache_cookie *cookie,
616 struct page *page,
617 gfp_t gfp)
618{
619 struct fscache_storage *op;
620 struct fscache_object *object;
621 int ret;
622
623 _enter("%p,%x,", cookie, (u32) page->flags);
624
625 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
626 ASSERT(PageFsCache(page));
627
628 fscache_stat(&fscache_n_stores);
629
630 op = kzalloc(sizeof(*op), GFP_NOIO);
631 if (!op)
632 goto nomem;
633
634 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
637
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0)
640 goto nomem_free;
641
642 ret = -ENOBUFS;
643 spin_lock(&cookie->lock);
644
645 if (hlist_empty(&cookie->backing_objects))
646 goto nobufs;
647 object = hlist_entry(cookie->backing_objects.first,
648 struct fscache_object, cookie_link);
649 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
650 goto nobufs;
651
652 /* add the page to the pending-storage radix tree on the backing
653 * object */
654 spin_lock(&object->lock);
655
656 _debug("store limit %llx", (unsigned long long) object->store_limit);
657
658 ret = radix_tree_insert(&cookie->stores, page->index, page);
659 if (ret < 0) {
660 if (ret == -EEXIST)
661 goto already_queued;
662 _debug("insert failed %d", ret);
663 goto nobufs_unlock_obj;
664 }
665
666 radix_tree_tag_set(&cookie->stores, page->index,
667 FSCACHE_COOKIE_PENDING_TAG);
668 page_cache_get(page);
669
670 /* we only want one writer at a time, but we do need to queue new
671 * writers after exclusive ops */
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending;
674
675 spin_unlock(&object->lock);
676
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
678 op->store_limit = object->store_limit;
679
680 if (fscache_submit_op(object, &op->op) < 0)
681 goto submit_failed;
682
683 spin_unlock(&cookie->lock);
684 radix_tree_preload_end();
685 fscache_stat(&fscache_n_store_ops);
686 fscache_stat(&fscache_n_stores_ok);
687
688 /* the slow work queue now carries its own ref on the object */
689 fscache_put_operation(&op->op);
690 _leave(" = 0");
691 return 0;
692
693already_queued:
694 fscache_stat(&fscache_n_stores_again);
695already_pending:
696 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock);
698 radix_tree_preload_end();
699 kfree(op);
700 fscache_stat(&fscache_n_stores_ok);
701 _leave(" = 0");
702 return 0;
703
704submit_failed:
705 radix_tree_delete(&cookie->stores, page->index);
706 page_cache_release(page);
707 ret = -ENOBUFS;
708 goto nobufs;
709
710nobufs_unlock_obj:
711 spin_unlock(&object->lock);
712nobufs:
713 spin_unlock(&cookie->lock);
714 radix_tree_preload_end();
715 kfree(op);
716 fscache_stat(&fscache_n_stores_nobufs);
717 _leave(" = -ENOBUFS");
718 return -ENOBUFS;
719
720nomem_free:
721 kfree(op);
722nomem:
723 fscache_stat(&fscache_n_stores_oom);
724 _leave(" = -ENOMEM");
725 return -ENOMEM;
726}
727EXPORT_SYMBOL(__fscache_write_page);
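/*
 * Illustrative sketch (not part of this patch): a netfs calls the
 * fscache_write_page() wrapper once a marked page is uptodate, and must
 * drop the PG_fscache mark itself if the store could not be queued:
 *
 *	if (fscache_write_page(cookie, page, GFP_KERNEL) != 0)
 *		fscache_uncache_page(cookie, page);	store was rejected
 */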
728
729/*
730 * remove a page from the cache
731 */
732void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
733{
734 struct fscache_object *object;
735
736 _enter(",%p", page);
737
738 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
739 ASSERTCMP(page, !=, NULL);
740
741 fscache_stat(&fscache_n_uncaches);
742
743 /* cache withdrawal may beat us to it */
744 if (!PageFsCache(page))
745 goto done;
746
747 /* get the object */
748 spin_lock(&cookie->lock);
749
750 if (hlist_empty(&cookie->backing_objects)) {
751 ClearPageFsCache(page);
752 goto done_unlock;
753 }
754
755 object = hlist_entry(cookie->backing_objects.first,
756 struct fscache_object, cookie_link);
757
758 /* there might now be stuff on disk we could read */
759 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
760
761 /* only invoke the cache backend if we managed to mark the page
762 * uncached here; this deals with synchronisation vs withdrawal */
763 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */
766 object->cache->ops->uncache_page(object, page);
767 goto done;
768 }
769
770done_unlock:
771 spin_unlock(&cookie->lock);
772done:
773 _leave("");
774}
775EXPORT_SYMBOL(__fscache_uncache_page);
776
777/**
778 * fscache_mark_pages_cached - Mark pages as being cached
779 * @op: The retrieval op pages are being marked for
780 * @pagevec: The pages to be marked
781 *
782 * Mark a bunch of netfs pages as being cached. After this is called,
783 * the netfs must call fscache_uncache_page() to remove the mark.
784 */
785void fscache_mark_pages_cached(struct fscache_retrieval *op,
786 struct pagevec *pagevec)
787{
788 struct fscache_cookie *cookie = op->op.object->cookie;
789 unsigned long loop;
790
791#ifdef CONFIG_FSCACHE_STATS
792 atomic_add(pagevec->nr, &fscache_n_marks);
793#endif
794
795 for (loop = 0; loop < pagevec->nr; loop++) {
796 struct page *page = pagevec->pages[loop];
797
798 _debug("- mark %p{%lx}", page, page->index);
799 if (TestSetPageFsCache(page)) {
800 static bool once_only;
801 if (!once_only) {
802 once_only = true;
803 printk(KERN_WARNING "FS-Cache:"
804 " Cookie type %s marked page %lx"
805 " multiple times\n",
806 cookie->def->name, page->index);
807 }
808 }
809 }
810
811 if (cookie->def->mark_pages_cached)
812 cookie->def->mark_pages_cached(cookie->netfs_data,
813 op->mapping, pagevec);
814 pagevec_reinit(pagevec);
815}
816EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
1/* FS-Cache statistics viewing interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL OPERATION
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * initialise the /proc/fs/fscache/ directory
20 */
21int __init fscache_proc_init(void)
22{
23 _enter("");
24
25 if (!proc_mkdir("fs/fscache", NULL))
26 goto error_dir;
27
28#ifdef CONFIG_FSCACHE_STATS
29 if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
30 &fscache_stats_fops))
31 goto error_stats;
32#endif
33
34#ifdef CONFIG_FSCACHE_HISTOGRAM
35 if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
36 &fscache_histogram_fops))
37 goto error_histogram;
38#endif
39
40 _leave(" = 0");
41 return 0;
42
43#ifdef CONFIG_FSCACHE_HISTOGRAM
44error_histogram:
45#endif
46#ifdef CONFIG_FSCACHE_STATS
47 remove_proc_entry("fs/fscache/stats", NULL);
48error_stats:
49#endif
50 remove_proc_entry("fs/fscache", NULL);
51error_dir:
52 _leave(" = -ENOMEM");
53 return -ENOMEM;
54}
55
56/*
57 * clean up the /proc/fs/fscache/ directory
58 */
59void fscache_proc_cleanup(void)
60{
61#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif
64#ifdef CONFIG_FSCACHE_STATS
65 remove_proc_entry("fs/fscache/stats", NULL);
66#endif
67 remove_proc_entry("fs/fscache", NULL);
68}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
1/* FS-Cache statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18/*
19 * operation counters
20 */
21atomic_t fscache_n_op_pend;
22atomic_t fscache_n_op_run;
23atomic_t fscache_n_op_enqueue;
24atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc;
28
29atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok;
31atomic_t fscache_n_attr_changed_nobufs;
32atomic_t fscache_n_attr_changed_nomem;
33atomic_t fscache_n_attr_changed_calls;
34
35atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs;
39atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits;
41
42atomic_t fscache_n_retrievals;
43atomic_t fscache_n_retrievals_ok;
44atomic_t fscache_n_retrievals_wait;
45atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem;
49atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits;
51
52atomic_t fscache_n_stores;
53atomic_t fscache_n_stores_ok;
54atomic_t fscache_n_stores_again;
55atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls;
59
60atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches;
62
63atomic_t fscache_n_acquires;
64atomic_t fscache_n_acquires_null;
65atomic_t fscache_n_acquires_no_cache;
66atomic_t fscache_n_acquires_ok;
67atomic_t fscache_n_acquires_nobufs;
68atomic_t fscache_n_acquires_oom;
69
70atomic_t fscache_n_updates;
71atomic_t fscache_n_updates_null;
72atomic_t fscache_n_updates_run;
73
74atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt;
77
78atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data;
80atomic_t fscache_n_cookie_special;
81
82atomic_t fscache_n_object_alloc;
83atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive;
87atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead;
90
91atomic_t fscache_n_checkaux_none;
92atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete;
95
96/*
97 * display the general statistics
98 */
99static int fscache_stats_show(struct seq_file *m, void *v)
100{
101 seq_puts(m, "FS-Cache statistics\n");
102
103 seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
104 atomic_read(&fscache_n_cookie_index),
105 atomic_read(&fscache_n_cookie_data),
106 atomic_read(&fscache_n_cookie_special));
107
108 seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
109 atomic_read(&fscache_n_object_alloc),
110 atomic_read(&fscache_n_object_no_alloc),
111 atomic_read(&fscache_n_object_avail),
112 atomic_read(&fscache_n_object_dead));
113 seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
114 atomic_read(&fscache_n_checkaux_none),
115 atomic_read(&fscache_n_checkaux_okay),
116 atomic_read(&fscache_n_checkaux_update),
117 atomic_read(&fscache_n_checkaux_obsolete));
118
119 seq_printf(m, "Pages : mrk=%u unc=%u\n",
120 atomic_read(&fscache_n_marks),
121 atomic_read(&fscache_n_uncaches));
122
123 seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
124 " oom=%u\n",
125 atomic_read(&fscache_n_acquires),
126 atomic_read(&fscache_n_acquires_null),
127 atomic_read(&fscache_n_acquires_no_cache),
128 atomic_read(&fscache_n_acquires_ok),
129 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom));
131
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
133 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive),
136 atomic_read(&fscache_n_object_created));
137
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
139 atomic_read(&fscache_n_updates),
140 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run));
142
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
144 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt));
147
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed),
150 atomic_read(&fscache_n_attr_changed_ok),
151 atomic_read(&fscache_n_attr_changed_nobufs),
152 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls));
154
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
156 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs));
160 seq_printf(m, "Allocs : ops=%u owt=%u\n",
161 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits));
163
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n",
166 atomic_read(&fscache_n_retrievals),
167 atomic_read(&fscache_n_retrievals_ok),
168 atomic_read(&fscache_n_retrievals_wait),
169 atomic_read(&fscache_n_retrievals_nodata),
170 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits));
176
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores),
179 atomic_read(&fscache_n_stores_ok),
180 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n",
184 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls));
186
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
188 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc));
195 return 0;
196}
197
198/*
199 * open "/proc/fs/fscache/stats" to provide a statistical summary
200 */
201static int fscache_stats_open(struct inode *inode, struct file *file)
202{
203 return single_open(file, fscache_stats_show, NULL);
204}
205
206const struct file_operations fscache_stats_fops = {
207 .owner = THIS_MODULE,
208 .open = fscache_stats_open,
209 .read = seq_read,
210 .llseek = seq_lseek,
211 .release = seq_release,
212};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06da05261e04..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719bd..06f30e965676 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       size_t *nbytesp, int write)
 {
+	size_t nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
-	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
 				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1234,8 +1252,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
@@ -1273,6 +1292,17 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	invalidate_inode_pages2(file->f_mapping);
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1907,6 +1937,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1916,7 +1947,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 459b73dd45e1..91f7c85f1ffd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,6 +19,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/smp_lock.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -259,7 +260,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 static void fuse_umount_begin(struct super_block *sb)
 {
+	lock_kernel();
 	fuse_abort_conn(get_fuse_conn_super(sb));
+	unlock_kernel();
 }
 
 static void fuse_send_destroy(struct fuse_conn *fc)
@@ -908,6 +911,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
  err_put_root:
 	dput(root_dentry);
  err_put_conn:
+	bdi_destroy(&fc->bdi);
 	fuse_conn_put(fc);
  err_fput:
 	fput(file);
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa763..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3984e47d1d33..ff4981090489 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -597,7 +597,6 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
-	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
@@ -614,15 +613,14 @@ __acquires(&gl->gl_spin)
 		if (ret == 0)
 			goto out_unlock;
 		if (ret == 2)
-			goto out_sem;
+			goto out;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
-out_sem:
-	up_read(&gfs2_umount_flush_sem);
+out:
 	return;
 
 out_sched:
@@ -631,7 +629,7 @@ out_sched:
 	gfs2_glock_put(gl);
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	goto out_sem;
+	goto out;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work)
641 639
642 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 640 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
643 finish_xmote(gl, gl->gl_reply); 641 finish_xmote(gl, gl->gl_reply);
642 down_read(&gfs2_umount_flush_sem);
644 spin_lock(&gl->gl_spin); 643 spin_lock(&gl->gl_spin);
645 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 644 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
646 gl->gl_state != LM_ST_UNLOCKED && 645 gl->gl_state != LM_ST_UNLOCKED &&
@@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work)
653 } 652 }
654 run_queue(gl, 0); 653 run_queue(gl, 0);
655 spin_unlock(&gl->gl_spin); 654 spin_unlock(&gl->gl_spin);
655 up_read(&gfs2_umount_flush_sem);
656 if (!delay || 656 if (!delay ||
657 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 657 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
658 gfs2_glock_put(gl); 658 gfs2_glock_put(gl);
@@ -1304,6 +1304,7 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1304 nr--; 1304 nr--;
1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1305 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1306 gfs2_glock_put(gl); 1306 gfs2_glock_put(gl);
1307 got_ref = 0;
1307 } 1308 }
1308 spin_lock(&lru_lock); 1309 spin_lock(&lru_lock);
1309 if (may_demote) 1310 if (may_demote)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62aa925..70f87f43afa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	error = filemap_fdatawait(metamapping);
 	mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7b277d449155..5a31d426116f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode)
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_file_fops_nolock;
+			inode->i_fop = &gfs2_file_fops_nolock;
 		else
-			inode->i_fop = gfs2_file_fops;
+			inode->i_fop = &gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_dir_fops_nolock;
+			inode->i_fop = &gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = gfs2_dir_fops;
+			inode->i_fop = &gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index dca4fee3078b..c30be2b66580 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations *gfs2_file_fops_nolock;
-extern const struct file_operations *gfs2_dir_fops_nolock;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
-extern const struct file_operations *gfs2_file_fops;
-extern const struct file_operations *gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return sdp->sd_args.ar_localflocks;
 }
 #else /* Single node only */
-#define gfs2_file_fops NULL
-#define gfs2_dir_fops NULL
+#define gfs2_file_fops gfs2_file_fops_nolock
+#define gfs2_dir_fops gfs2_dir_fops_nolock
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return 1;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500b..5d82e91887e3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,10 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
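The two hunks above track this release's ->page_mkwrite prototype change: the target page now arrives in vmf->page, and errno-style results must be folded into VM_FAULT_* codes. A hedged sketch of the converted shape, where fs_make_page_writable() is a hypothetical stand-in for the filesystem's real work:

    static int example_page_mkwrite(struct vm_area_struct *vma,
    				struct vm_fault *vmf)
    {
    	struct page *page = vmf->page;
    	int ret = fs_make_page_writable(vma, page);	/* hypothetical */

    	if (ret == -ENOMEM)
    		return VM_FAULT_OOM;	/* errno -> fault code */
    	if (ret)
    		return VM_FAULT_SIGBUS;
    	return 0;			/* page is now writable */
    }
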
417 422
@@ -702,7 +707,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
702 } 707 }
703} 708}
704 709
705const struct file_operations *gfs2_file_fops = &(const struct file_operations){ 710const struct file_operations gfs2_file_fops = {
706 .llseek = gfs2_llseek, 711 .llseek = gfs2_llseek,
707 .read = do_sync_read, 712 .read = do_sync_read,
708 .aio_read = generic_file_aio_read, 713 .aio_read = generic_file_aio_read,
@@ -720,7 +725,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){
720 .setlease = gfs2_setlease, 725 .setlease = gfs2_setlease,
721}; 726};
722 727
723const struct file_operations *gfs2_dir_fops = &(const struct file_operations){ 728const struct file_operations gfs2_dir_fops = {
724 .readdir = gfs2_readdir, 729 .readdir = gfs2_readdir,
725 .unlocked_ioctl = gfs2_ioctl, 730 .unlocked_ioctl = gfs2_ioctl,
726 .open = gfs2_open, 731 .open = gfs2_open,
@@ -732,7 +737,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
732 737
733#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 738#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
734 739
735const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){ 740const struct file_operations gfs2_file_fops_nolock = {
736 .llseek = gfs2_llseek, 741 .llseek = gfs2_llseek,
737 .read = do_sync_read, 742 .read = do_sync_read,
738 .aio_read = generic_file_aio_read, 743 .aio_read = generic_file_aio_read,
@@ -748,7 +753,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat
748 .setlease = generic_setlease, 753 .setlease = generic_setlease,
749}; 754};
750 755
751const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){ 756const struct file_operations gfs2_dir_fops_nolock = {
752 .readdir = gfs2_readdir, 757 .readdir = gfs2_readdir,
753 .unlocked_ioctl = gfs2_ioctl, 758 .unlocked_ioctl = gfs2_ioctl,
754 .open = gfs2_open, 759 .open = gfs2_open,
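The four tables above stop being pointers to anonymous compound literals and become ordinary const objects, so the single-node #defines in inode.h can alias them and use sites take their address. A stripped-down model of the change, with illustrative names:

    struct example_ops {
    	int (*open)(void);
    };

    static int example_open(void) { return 0; }

    /* old style (pointer to a compound literal):
     *   const struct example_ops *example_fops =
     *	&(const struct example_ops){ .open = example_open };
     */

    /* new style: the table itself is the exported const object */
    const struct example_ops example_fops = {
    	.open = example_open,
    };

    /* use sites now take the address: inode->i_fop = &example_fops; */
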
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 51883b3ad89c..1ff9473ea753 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	lock_page(page);
 
 	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
 	bio->bi_sector = sector * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -1287,21 +1282,21 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 static struct super_block *get_gfs2_sb(const char *dev_name)
 {
 	struct super_block *sb;
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
 		return NULL;
 	}
-	sb = nd.path.dentry->d_inode->i_sb;
+	sb = path.dentry->d_inode->i_sb;
 	if (sb && (sb->s_type == &gfs2_fs_type))
 		atomic_inc(&sb->s_active);
 	else
 		sb = NULL;
-	path_put(&nd.path);
+	path_put(&path);
 	return sb;
 }
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index abd5429ae285..1c70fa5168d6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_disksize = size;
+	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8d53f66b5bcc..152e6c4a0dca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -81,7 +81,7 @@ struct gfs2_quota_change_host {
 
 static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qd_lru_lock);
 
 int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
 {
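SPIN_LOCK_UNLOCKED initializers were being phased out at this point because one shared initializer gives every such lock the same lockdep class; DEFINE_SPINLOCK declares the lock and gives it its own key. A minimal sketch of the replacement pattern:

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);	/* declared and initialized */

    static void example_critical_section(void)
    {
    	spin_lock(&example_lock);
    	/* ... protected work ... */
    	spin_unlock(&example_lock);
    }
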
@@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data)
 			refrigerator();
 		t = min(quotad_timeo, statfs_timeo);
 
-		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_lock(&sdp->sd_trunc_lock);
 		empty = list_empty(&sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f03d024038ea..565038243fa2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	if (tmp == 0)
 		return BFITNOENT;
 	ptr--;
-	bit = fls64(tmp);
-	bit--;		/* fls64 always adds one to the bit count */
+	bit = __ffs64(tmp);
 	bit /= 2;	/* two bits per entry in the bitmap */
 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
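gfs2 bitmaps hold two bits per block, so the entry index is the bit position divided by two. __ffs64() returns the lowest set bit directly, which is the end this search actually wants, whereas fls64() returned the highest bit plus one and needed the decrement deleted above. A small userspace model of the arithmetic, using __builtin_ctzll() in place of the kernel's __ffs64():

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t tmp = 0x10;			/* lowest set bit: bit 4 */
    	unsigned int bit = __builtin_ctzll(tmp);

    	printf("bitmap entry %u\n", bit / 2);	/* two bits/entry -> 2 */
    	return 0;
    }
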
@@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
+	int error;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
-
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error == 0) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
+		brelse(dibh);
+	}
 	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
 	rgd->rd_free -= *n;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e0..a1cbff2b4d99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		BUG();
 		return 0;
 	}
+
+	if (!tree)
+		return 0;
+
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
 		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa3..7b6165f25fbe 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
 	if (HFS_SB(sb)->nls_disk)
 		unload_nls(HFS_SB(sb)->nls_disk);
 
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
 static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = HFS_SB(sb)->fs_ablocks;
 	buf->f_ffree = HFS_SB(sb)->free_ablocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFS_NAMELEN;
 
 	return 0;
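hfs here, and hfsplus, hpfs and isofs below, all gain the same statfs hunk: the 64-bit encoded block device number is split across the two 32-bit halves of f_fsid. A userspace model of the split (the sample id is invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t id = 0x0000001200000034ULL;	/* stand-in for huge_encode_dev() */
    	uint32_t val0 = (uint32_t)id;		/* low half  -> f_fsid.val[0] */
    	uint32_t val1 = (uint32_t)(id >> 32);	/* high half -> f_fsid.val[1] */

    	printf("val[0]=%08x val[1]=%08x\n", val0, val1);
    	return 0;
    }
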
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
 	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
 
 	return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fc77965be841 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	lock_kernel();
 
 	/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->sb_n_free;
 	buf->f_files = sbi->sb_dirband_size / 4;
 	buf->f_ffree = sbi->sb_n_free_dnodes;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
 	unlock_kernel();
@@ -420,8 +423,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	if (!(*flags & MS_RDONLY)) mark_dirty(s);
 
-	kfree(s->s_options);
-	s->s_options = new_opts;
+	replace_mount_options(s, new_opts);
 
 	return 0;
 
@@ -477,7 +479,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
 			       "errno = %d\n", err);
 			return err;
 		}
-		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		err = hppfs_read_file(hppfs->host_fd, buf, count);
+		if (err < 0) {
+			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+			return err;
+		}
+		count = err;
 		if (count > 0)
 			*ppos += count;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a687..c1462d43e721 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -313,16 +312,6 @@ out:
 	return retval;
 }
 
-/*
- * Read a page. Again trivial. If it didn't already exist
- * in the page cache, it is zero-filled.
- */
-static int hugetlbfs_readpage(struct file *file, struct page * page)
-{
-	unlock_page(page);
-	return -EINVAL;
-}
-
 static int hugetlbfs_write_begin(struct file *file,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -702,7 +691,6 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
-	.readpage	= hugetlbfs_readpage,
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
@@ -842,7 +830,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
 	printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
 	       args[0].from, p);
-	return 1;
+	return -EINVAL;
 }
 
 static int
@@ -943,14 +931,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
 
 static int can_do_hugetlb_shm(void)
 {
-	return likely(capable(CAP_IPC_LOCK) ||
-			in_group_p(sysctl_hugetlb_shm_group) ||
-			can_do_mlock());
+	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
+	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
@@ -960,11 +947,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
-	if (!can_do_hugetlb_shm())
-		return ERR_PTR(-EPERM);
-
-	if (!user_shm_lock(size, user))
-		return ERR_PTR(-ENOMEM);
+	if (!can_do_hugetlb_shm()) {
+		if (user_shm_lock(size, user)) {
+			unlock_shm = 1;
+			WARN_ONCE(1,
+				  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+		} else
+			return ERR_PTR(-EPERM);
+	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
 	quick_string.name = name;
@@ -1004,7 +994,8 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, user);
+	if (unlock_shm)
+		user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
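The hugetlb_file_setup() rework changes who may create SHM_HUGETLB segments: CAP_IPC_LOCK or hugetlb_shm_group membership passes outright, and other callers are grandfathered through the mlock rlimit with a one-time deprecation warning; the unlock_shm flag remembers whether the rlimit charge must be undone on the error path. A compressed, hedged model of that decision with illustrative names:

    /* Returns 0 if allowed, -EPERM otherwise. *need_unlock tells the
     * caller whether a later user_shm_unlock() must balance the charge.
     */
    static int example_hugetlb_perm(int privileged, int rlimit_charge_ok,
    				int *need_unlock)
    {
    	*need_unlock = 0;
    	if (privileged)
    		return 0;
    	if (rlimit_charge_ok) {
    		*need_unlock = 1;	/* deprecated mlock-rlimit fallback */
    		return 0;
    	}
    	return -EPERM;
    }
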
diff --git a/fs/inode.c b/fs/inode.c
index d06d6d268de9..bca0c618fdb3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -99,7 +99,7 @@ static DEFINE_MUTEX(iprune_mutex);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct kmem_cache * inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __read_mostly;
 
 static void wake_up_inode(struct inode *inode)
 {
@@ -124,7 +124,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 
-	struct address_space * const mapping = &inode->i_data;
+	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
 	inode->i_blkbits = sb->s_blocksize_bits;
@@ -216,9 +216,10 @@ static struct inode *alloc_inode(struct super_block *sb)
 	return NULL;
 }
 
 void destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	ima_inode_free(inode);
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
@@ -252,12 +253,11 @@ void inode_init_once(struct inode *inode)
 	mutex_init(&inode->inotify_mutex);
 #endif
 }
-
 EXPORT_SYMBOL(inode_init_once);
 
 static void init_once(void *foo)
 {
-	struct inode * inode = (struct inode *) foo;
+	struct inode *inode = (struct inode *) foo;
 
 	inode_init_once(inode);
 }
@@ -265,7 +265,7 @@ static void init_once(void *foo)
 /*
  * inode_lock must be held
  */
-void __iget(struct inode * inode)
+void __iget(struct inode *inode)
 {
 	if (atomic_read(&inode->i_count)) {
 		atomic_inc(&inode->i_count);
@@ -289,7 +289,7 @@ void clear_inode(struct inode *inode)
 {
 	might_sleep();
 	invalidate_inode_buffers(inode);
-       
+
 	BUG_ON(inode->i_data.nrpages);
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
@@ -303,7 +303,6 @@ void clear_inode(struct inode *inode)
 		cd_forget(inode);
 	inode->i_state = I_CLEAR;
 }
-
EXPORT_SYMBOL(clear_inode);
 
 /*
@@ -351,8 +350,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 
 	next = head->next;
 	for (;;) {
-		struct list_head * tmp = next;
-		struct inode * inode;
+		struct list_head *tmp = next;
+		struct inode *inode;
 
 		/*
 		 * We can reschedule here without worrying about the list's
@@ -391,7 +390,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
  * fails because there are busy inodes then a non zero value is returned.
  * If the discard is successful all the inodes have been discarded.
  */
-int invalidate_inodes(struct super_block * sb)
+int invalidate_inodes(struct super_block *sb)
 {
 	int busy;
 	LIST_HEAD(throw_away);
@@ -407,7 +406,6 @@ int invalidate_inodes(struct super_block * sb)
 
 	return busy;
 }
-
 EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
@@ -504,7 +502,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
 		 * Nasty deadlock avoidance. We may hold various FS locks,
 		 * and we don't want to recurse into the FS that called us
 		 * in clear_inode() and friends..
-	 	 */
+		 */
 	if (!(gfp_mask & __GFP_FS))
 		return -1;
 	prune_icache(nr);
@@ -524,10 +522,13 @@ static void __wait_on_freeing_inode(struct inode *inode);
 * by hand after calling find_inode now! This simplifies iunique and won't
 * add any additional branch in the common code.
 */
-static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+static struct inode *find_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				void *data)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -548,10 +549,11 @@ repeat:
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
-static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+static struct inode *find_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
 	struct hlist_node *node;
-	struct inode * inode = NULL;
+	struct inode *inode = NULL;
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
@@ -631,10 +633,10 @@ struct inode *new_inode(struct super_block *sb)
 	 * here to attempt to avoid that.
 	 */
 	static unsigned int last_ino;
-	struct inode * inode;
+	struct inode *inode;
 
 	spin_lock_prefetch(&inode_lock);
-	
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
@@ -645,7 +647,6 @@ struct inode *new_inode(struct super_block *sb)
 	}
 	return inode;
 }
-
 EXPORT_SYMBOL(new_inode);
 
 void unlock_new_inode(struct inode *inode)
@@ -674,7 +675,6 @@ void unlock_new_inode(struct inode *inode)
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
 }
-
 EXPORT_SYMBOL(unlock_new_inode);
 
 /*
@@ -683,13 +683,17 @@ EXPORT_SYMBOL(unlock_new_inode);
 * We no longer cache the sb_flags in i_flags - see fs.h
 *	-- rmk@arm.uk.linux.org
 */
-static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+static struct inode *get_new_inode(struct super_block *sb,
+				struct hlist_head *head,
+				int (*test)(struct inode *, void *),
+				int (*set)(struct inode *, void *),
+				void *data)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -731,13 +735,14 @@ set_failed:
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
-static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+static struct inode *get_new_inode_fast(struct super_block *sb,
+				struct hlist_head *head, unsigned long ino)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		struct inode * old;
+		struct inode *old;
 
 		spin_lock(&inode_lock);
 		/* We released the lock, so.. */
@@ -823,7 +828,6 @@ struct inode *igrab(struct inode *inode)
 	spin_unlock(&inode_lock);
 	return inode;
 }
-
 EXPORT_SYMBOL(igrab);
 
 /**
@@ -924,7 +928,6 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 0);
 }
-
 EXPORT_SYMBOL(ilookup5_nowait);
 
 /**
@@ -953,7 +956,6 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 
 	return ifind(sb, head, test, data, 1);
 }
-
 EXPORT_SYMBOL(ilookup5);
 
 /**
@@ -976,7 +978,6 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 
 	return ifind_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(ilookup);
 
 /**
@@ -1015,7 +1016,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
 	 */
 	return get_new_inode(sb, head, test, set, data);
 }
-
 EXPORT_SYMBOL(iget5_locked);
 
 /**
@@ -1047,7 +1047,6 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 	 */
 	return get_new_inode_fast(sb, head, ino);
 }
-
 EXPORT_SYMBOL(iget_locked);
 
 int insert_inode_locked(struct inode *inode)
@@ -1055,13 +1054,22 @@ int insert_inode_locked(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
 		spin_lock(&inode_lock);
-		old = find_inode_fast(sb, head, ino);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_ino != ino)
+				continue;
+			if (old->i_sb != sb)
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
@@ -1076,7 +1084,6 @@ int insert_inode_locked(struct inode *inode)
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked);
 
 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
@@ -1084,14 +1091,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 {
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
-	struct inode *old;
 
 	inode->i_state |= I_LOCK|I_NEW;
 
 	while (1) {
+		struct hlist_node *node;
+		struct inode *old = NULL;
+
 		spin_lock(&inode_lock);
-		old = find_inode(sb, head, test, data);
-		if (likely(!old)) {
+		hlist_for_each_entry(old, node, head, i_hash) {
+			if (old->i_sb != sb)
+				continue;
+			if (!test(old, data))
+				continue;
+			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+				continue;
+			break;
+		}
+		if (likely(!node)) {
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode_lock);
 			return 0;
@@ -1106,7 +1123,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		iput(old);
 	}
 }
-
 EXPORT_SYMBOL(insert_inode_locked4);
 
 /**
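Both rewritten insert paths above depend on a subtlety of the four-argument hlist_for_each_entry() of this era: when the walk falls off the chain the node cursor is NULL while the entry cursor is stale, so "nothing found" must be tested through node rather than old. Schematically (inode_matches() is a hypothetical predicate):

    struct hlist_node *node;
    struct inode *old = NULL;

    hlist_for_each_entry(old, node, head, i_hash) {
    	if (inode_matches(old))
    		break;		/* found: node is non-NULL here */
    }
    if (!node) {
    	/* walked the whole chain without a live match: safe to insert */
    }
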
@@ -1124,7 +1140,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 	hlist_add_head(&inode->i_hash, head);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(__insert_inode_hash);
 
 /**
@@ -1139,7 +1154,6 @@ void remove_inode_hash(struct inode *inode)
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
 }
-
 EXPORT_SYMBOL(remove_inode_hash);
 
 /*
@@ -1187,7 +1201,6 @@ void generic_delete_inode(struct inode *inode)
 	BUG_ON(inode->i_state != I_CLEAR);
 	destroy_inode(inode);
 }
-
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
@@ -1237,12 +1250,11 @@ void generic_drop_inode(struct inode *inode)
 	else
 		generic_forget_inode(inode);
 }
-
 EXPORT_SYMBOL_GPL(generic_drop_inode);
 
 /*
  * Called when we're dropping the last reference
  * to an inode.
  *
  * Call the FS "drop()" function, defaulting to
  * the legacy UNIX filesystem behaviour..
@@ -1262,7 +1274,7 @@ static inline void iput_final(struct inode *inode)
 }
 
 /**
- *	iput	- put an inode 
+ *	iput	- put an inode
 *	@inode: inode to put
 *
 *	Puts an inode, dropping its usage count. If the inode use count hits
@@ -1279,7 +1291,6 @@ void iput(struct inode *inode)
 		iput_final(inode);
 	}
 }
-
 EXPORT_SYMBOL(iput);
 
 /**
@@ -1290,10 +1301,10 @@ EXPORT_SYMBOL(iput);
 *	Returns the block number on the device holding the inode that
 *	is the disk block number for the block of the file requested.
 *	That is, asked for block 4 of inode 1 the function will return the
 *	disk block relative to the disk start that holds that block of the
 *	file.
 */
-sector_t bmap(struct inode * inode, sector_t block)
+sector_t bmap(struct inode *inode, sector_t block)
 {
 	sector_t res = 0;
 	if (inode->i_mapping->a_ops->bmap)
@@ -1425,7 +1436,6 @@ void file_update_time(struct file *file)
 	mark_inode_dirty_sync(inode);
 	mnt_drop_write(file->f_path.mnt);
 }
-
 EXPORT_SYMBOL(file_update_time);
 
 int inode_needs_sync(struct inode *inode)
@@ -1436,7 +1446,6 @@ int inode_needs_sync(struct inode *inode)
 		return 1;
 	return 0;
 }
-
 EXPORT_SYMBOL(inode_needs_sync);
 
 int inode_wait(void *word)
@@ -1470,42 +1479,6 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	spin_lock(&inode_lock);
 }
 
-/*
- * We rarely want to lock two inodes that do not have a parent/child
- * relationship (such as directory, child inode) simultaneously. The
- * vast majority of file systems should be able to get along fine
- * without this. Do not use these functions except as a last resort.
- */
-void inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
-	}
-
-	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
-	}
-}
-EXPORT_SYMBOL(inode_double_lock);
-
-void inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1)
-		mutex_unlock(&inode1->i_mutex);
-
-	if (inode2 && inode2 != inode1)
-		mutex_unlock(&inode2->i_mutex);
-}
-EXPORT_SYMBOL(inode_double_unlock);
-
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
 {
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index ac2d47e43926..82d9c42b8bac 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -258,7 +258,7 @@ int __generic_block_fiemap(struct inode *inode,
 	long long length = 0, map_len = 0;
 	u64 logical = 0, phys = 0, size = 0;
 	u32 flags = FIEMAP_EXTENT_MERGED;
-	int ret = 0;
+	int ret = 0, past_eof = 0, whole_file = 0;
 
 	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
 		return ret;
@@ -266,6 +266,9 @@ int __generic_block_fiemap(struct inode *inode,
 	start_blk = logical_to_blk(inode, start);
 
 	length = (long long)min_t(u64, len, i_size_read(inode));
+	if (length < len)
+		whole_file = 1;
+
 	map_len = length;
 
 	do {
@@ -282,11 +285,26 @@ int __generic_block_fiemap(struct inode *inode,
 
 		/* HOLE */
 		if (!buffer_mapped(&tmp)) {
+			length -= blk_to_logical(inode, 1);
+			start_blk++;
+
+			/*
+			 * we want to handle the case where there is an
+			 * allocated block at the front of the file, and then
+			 * nothing but holes up to the end of the file properly,
+			 * to make sure that extent at the front gets properly
+			 * marked with FIEMAP_EXTENT_LAST
+			 */
+			if (!past_eof &&
+			    blk_to_logical(inode, start_blk) >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
+
 			/*
 			 * first hole after going past the EOF, this is our
 			 * last extent
 			 */
-			if (length <= 0) {
+			if (past_eof && size) {
 				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
@@ -294,15 +312,37 @@ int __generic_block_fiemap(struct inode *inode,
 				break;
 			}
 
-			length -= blk_to_logical(inode, 1);
-
 			/* if we have holes up to/past EOF then we're done */
-			if (length <= 0)
+			if (length <= 0 || past_eof)
 				break;
-
-			start_blk++;
 		} else {
-			if (length <= 0 && size) {
+			/*
+			 * we have gone over the length of what we wanted to
+			 * map, and it wasn't the entire file, so add the extent
+			 * we got last time and exit.
+			 *
+			 * This is for the case where say we want to map all the
+			 * way up to the second to the last block in a file, but
+			 * the last block is a hole, making the second to last
+			 * block FIEMAP_EXTENT_LAST. In this case we want to
+			 * see if there is a hole after the second to last block
+			 * so we can mark it properly. If we found data after
+			 * we exceeded the length we were requesting, then we
+			 * are good to go, just add the extent to the fieinfo
+			 * and break
+			 */
+			if (length <= 0 && !whole_file) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			/*
+			 * if size != 0 then we know we already have an extent
+			 * to add, so add it.
+			 */
+			if (size) {
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
 							      flags);
@@ -319,19 +359,14 @@ int __generic_block_fiemap(struct inode *inode,
 			start_blk += logical_to_blk(inode, size);
 
 			/*
-			 * if we are past the EOF we need to loop again to see
-			 * if there is a hole so we can mark this extent as the
-			 * last one, and if not keep mapping things until we
-			 * find a hole, or we run out of slots in the extent
-			 * array
+			 * If we are past the EOF, then we need to make sure as
+			 * soon as we find a hole that the last extent we found
+			 * is marked with FIEMAP_EXTENT_LAST
 			 */
-			if (length <= 0)
-				continue;
-
-			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
-						      size, flags);
-			if (ret)
-				break;
+			if (!past_eof &&
+			    logical+size >=
+			    blk_to_logical(inode, 0)+i_size_read(inode))
+				past_eof = 1;
 		}
 		cond_resched();
 	} while (1);
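A worked example of the new past_eof bookkeeping: an 8192-byte file with 4096-byte blocks where block 0 is mapped and block 1 is a hole. When the hole branch advances past i_size, the pending extent for block 0 must be flushed with FIEMAP_EXTENT_LAST rather than silently dropped. The userspace arithmetic, hedged as a toy model:

    #include <stdio.h>

    int main(void)
    {
    	long blksz = 4096, i_size = 8192;
    	long start_blk = 1;		/* block 0 (data) already consumed */
    	int past_eof = 0, have_pending_extent = 1;

    	/* hole branch: step over the hole, then test against EOF */
    	start_blk++;
    	if (blksz * start_blk >= i_size)
    		past_eof = 1;

    	if (past_eof && have_pending_extent)
    		printf("emit pending extent with FIEMAP_EXTENT_LAST\n");
    	return 0;
    }
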
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd0692..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
 static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = 0;
 	buf->f_files = ISOFS_SB(sb)->s_ninodes;
 	buf->f_ffree = 0;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..618e21c0b7a3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bio.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
171 return (ret == -EIO); 172 return (ret == -EIO);
172} 173}
173 174
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176 int write_op)
175{ 177{
176 int i; 178 int i;
177 179
178 for (i = 0; i < bufs; i++) { 180 for (i = 0; i < bufs; i++) {
179 wbuf[i]->b_end_io = end_buffer_write_sync; 181 wbuf[i]->b_end_io = end_buffer_write_sync;
180 /* We use-up our safety reference in submit_bh() */ 182 /* We use-up our safety reference in submit_bh() */
181 submit_bh(WRITE, wbuf[i]); 183 submit_bh(write_op, wbuf[i]);
182 } 184 }
183} 185}
184 186
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
186 * Submit all the data buffers to disk 188 * Submit all the data buffers to disk
187 */ 189 */
188static int journal_submit_data_buffers(journal_t *journal, 190static int journal_submit_data_buffers(journal_t *journal,
189 transaction_t *commit_transaction) 191 transaction_t *commit_transaction,
192 int write_op)
190{ 193{
191 struct journal_head *jh; 194 struct journal_head *jh;
192 struct buffer_head *bh; 195 struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
225 BUFFER_TRACE(bh, "needs blocking lock"); 228 BUFFER_TRACE(bh, "needs blocking lock");
226 spin_unlock(&journal->j_list_lock); 229 spin_unlock(&journal->j_list_lock);
227 /* Write out all data to prevent deadlocks */ 230 /* Write out all data to prevent deadlocks */
228 journal_do_submit_data(wbuf, bufs); 231 journal_do_submit_data(wbuf, bufs, write_op);
229 bufs = 0; 232 bufs = 0;
230 lock_buffer(bh); 233 lock_buffer(bh);
231 spin_lock(&journal->j_list_lock); 234 spin_lock(&journal->j_list_lock);
@@ -238,7 +241,7 @@ write_out_data:
238 spin_lock(&journal->j_list_lock); 241 spin_lock(&journal->j_list_lock);
239 } 242 }
240 /* Someone already cleaned up the buffer? */ 243 /* Someone already cleaned up the buffer? */
241 if (!buffer_jbd(bh) 244 if (!buffer_jbd(bh) || bh2jh(bh) != jh
242 || jh->b_transaction != commit_transaction 245 || jh->b_transaction != commit_transaction
243 || jh->b_jlist != BJ_SyncData) { 246 || jh->b_jlist != BJ_SyncData) {
244 jbd_unlock_bh_state(bh); 247 jbd_unlock_bh_state(bh);
@@ -256,7 +259,7 @@ write_out_data:
256 jbd_unlock_bh_state(bh); 259 jbd_unlock_bh_state(bh);
257 if (bufs == journal->j_wbufsize) { 260 if (bufs == journal->j_wbufsize) {
258 spin_unlock(&journal->j_list_lock); 261 spin_unlock(&journal->j_list_lock);
259 journal_do_submit_data(wbuf, bufs); 262 journal_do_submit_data(wbuf, bufs, write_op);
260 bufs = 0; 263 bufs = 0;
261 goto write_out_data; 264 goto write_out_data;
262 } 265 }
@@ -286,7 +289,7 @@ write_out_data:
286 } 289 }
287 } 290 }
288 spin_unlock(&journal->j_list_lock); 291 spin_unlock(&journal->j_list_lock);
289 journal_do_submit_data(wbuf, bufs); 292 journal_do_submit_data(wbuf, bufs, write_op);
290 293
291 return err; 294 return err;
292} 295}
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
315 int first_tag = 0; 318 int first_tag = 0;
316 int tag_flag; 319 int tag_flag;
317 int i; 320 int i;
321 int write_op = WRITE;
318 322
319 /* 323 /*
320 * First job: lock down the current transaction and wait for 324 * First job: lock down the current transaction and wait for
@@ -347,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
347 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
348 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
349 353
354 /*
355 * Use plugged writes here, since we want to submit several before
356 * we unplug the device. We don't do explicit unplugging in here,
357 * instead we rely on sync_buffer() doing the unplug for us.
358 */
359 if (commit_transaction->t_synchronous_commit)
360 write_op = WRITE_SYNC_PLUG;
350 spin_lock(&commit_transaction->t_handle_lock); 361 spin_lock(&commit_transaction->t_handle_lock);
351 while (commit_transaction->t_updates) { 362 while (commit_transaction->t_updates) {
352 DEFINE_WAIT(wait); 363 DEFINE_WAIT(wait);
@@ -431,7 +442,8 @@ void journal_commit_transaction(journal_t *journal)
431 * Now start flushing things to disk, in the order they appear 442 * Now start flushing things to disk, in the order they appear
432 * on the transaction lists. Data blocks go first. 443 * on the transaction lists. Data blocks go first.
433 */ 444 */
434 err = journal_submit_data_buffers(journal, commit_transaction); 445 err = journal_submit_data_buffers(journal, commit_transaction,
446 write_op);
435 447
436 /* 448 /*
437 * Wait for all previously submitted IO to complete. 449 * Wait for all previously submitted IO to complete.
@@ -466,7 +478,9 @@ void journal_commit_transaction(journal_t *journal)
466 spin_lock(&journal->j_list_lock); 478 spin_lock(&journal->j_list_lock);
467 continue; 479 continue;
468 } 480 }
469 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { 481 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
482 jh->b_transaction == commit_transaction &&
483 jh->b_jlist == BJ_Locked) {
470 __journal_unfile_buffer(jh); 484 __journal_unfile_buffer(jh);
471 jbd_unlock_bh_state(bh); 485 jbd_unlock_bh_state(bh);
472 journal_remove_journal_head(bh); 486 journal_remove_journal_head(bh);
@@ -490,7 +504,7 @@ void journal_commit_transaction(journal_t *journal)
490 err = 0; 504 err = 0;
491 } 505 }
492 506
493 journal_write_revoke_records(journal, commit_transaction); 507 journal_write_revoke_records(journal, commit_transaction, write_op);
494 508
495 /* 509 /*
496 * If we found any dirty or locked buffers, then we should have 510 * If we found any dirty or locked buffers, then we should have
@@ -660,7 +674,7 @@ start_journal_io:
660 clear_buffer_dirty(bh); 674 clear_buffer_dirty(bh);
661 set_buffer_uptodate(bh); 675 set_buffer_uptodate(bh);
662 bh->b_end_io = journal_end_buffer_io_sync; 676 bh->b_end_io = journal_end_buffer_io_sync;
663 submit_bh(WRITE, bh); 677 submit_bh(write_op, bh);
664 } 678 }
665 cond_resched(); 679 cond_resched();
666 680
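
The hunks above replace hard-coded WRITE submissions with a write_op chosen once per commit: if any handle in the transaction was synchronous, data buffers, revoke records and metadata are all submitted with WRITE_SYNC_PLUG, which marks the I/O synchronous while leaving the queue plugged so several buffers can be batched before sync_buffer() unplugs. A minimal userspace sketch of the selection (the OP_* values are illustrative stand-ins for the kernel's WRITE/WRITE_SYNC_PLUG flags, not the real constants):

#include <stdio.h>

enum { OP_WRITE, OP_WRITE_SYNC_PLUG };  /* stand-ins, values illustrative */

struct transaction { int t_synchronous_commit; };

/* Decide the block-layer op once, then reuse it for every submission
 * in the commit: data buffers, revoke records and metadata alike. */
static int commit_write_op(const struct transaction *t)
{
        return t->t_synchronous_commit ? OP_WRITE_SYNC_PLUG : OP_WRITE;
}

int main(void)
{
        struct transaction fsync_driven = { .t_synchronous_commit = 1 };
        struct transaction background = { .t_synchronous_commit = 0 };

        printf("fsync-driven commit -> op %d\n", commit_write_op(&fsync_driven));
        printf("background commit   -> op %d\n", commit_write_op(&background));
        return 0;
}
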
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
637 return NULL; 637 return NULL;
638 638
639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
640 if (!bh)
641 return NULL;
640 lock_buffer(bh); 642 lock_buffer(bh);
641 memset(bh->b_data, 0, journal->j_blocksize); 643 memset(bh->b_data, 0, journal->j_blocksize);
642 set_buffer_uptodate(bh); 644 set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
733 if (!journal->j_wbuf) { 735 if (!journal->j_wbuf) {
734 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 736 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
735 __func__); 737 __func__);
736 kfree(journal); 738 goto out_err;
737 journal = NULL;
738 goto out;
739 } 739 }
740 journal->j_dev = bdev; 740 journal->j_dev = bdev;
741 journal->j_fs_dev = fs_dev; 741 journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
743 journal->j_maxlen = len; 743 journal->j_maxlen = len;
744 744
745 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 745 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
746 J_ASSERT(bh != NULL); 746 if (!bh) {
747 printk(KERN_ERR
748 "%s: Cannot get buffer for journal superblock\n",
749 __func__);
750 goto out_err;
751 }
747 journal->j_sb_buffer = bh; 752 journal->j_sb_buffer = bh;
748 journal->j_superblock = (journal_superblock_t *)bh->b_data; 753 journal->j_superblock = (journal_superblock_t *)bh->b_data;
749out: 754
750 return journal; 755 return journal;
756out_err:
757 kfree(journal);
758 return NULL;
751} 759}
752 760
753/** 761/**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
787 if (!journal->j_wbuf) { 795 if (!journal->j_wbuf) {
788 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 796 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
789 __func__); 797 __func__);
790 kfree(journal); 798 goto out_err;
791 return NULL;
792 } 799 }
793 800
794 err = journal_bmap(journal, 0, &blocknr); 801 err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
796 if (err) { 803 if (err) {
797 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 804 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
798 __func__); 805 __func__);
799 kfree(journal); 806 goto out_err;
800 return NULL;
801 } 807 }
802 808
803 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 809 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
804 J_ASSERT(bh != NULL); 810 if (!bh) {
811 printk(KERN_ERR
812 "%s: Cannot get buffer for journal superblock\n",
813 __func__);
814 goto out_err;
815 }
805 journal->j_sb_buffer = bh; 816 journal->j_sb_buffer = bh;
806 journal->j_superblock = (journal_superblock_t *)bh->b_data; 817 journal->j_superblock = (journal_superblock_t *)bh->b_data;
807 818
808 return journal; 819 return journal;
820out_err:
821 kfree(journal);
822 return NULL;
809} 823}
810 824
811/* 825/*
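
Both journal_init_dev() and journal_init_inode() above now funnel every failure through a single out_err label, and the two __getblk() calls that were previously "guaranteed" by J_ASSERT() get real NULL checks. A compilable model of the pattern, with malloc()/calloc() standing in for the kernel allocators and __getblk():

#include <stdlib.h>

struct journal { void *j_wbuf; void *j_sb_buffer; };

/* Single-exit error handling: every failure after the first allocation
 * jumps to one label that releases everything, so no path can forget
 * the kfree() the old code duplicated at each exit. */
static struct journal *journal_init_sketch(void)
{
        struct journal *journal = calloc(1, sizeof(*journal));
        if (!journal)
                return NULL;

        journal->j_wbuf = malloc(64);
        if (!journal->j_wbuf)
                goto out_err;

        journal->j_sb_buffer = malloc(4096);  /* may fail, like __getblk() */
        if (!journal->j_sb_buffer)
                goto out_err;

        return journal;

out_err:
        free(journal->j_wbuf);                /* free(NULL) is a no-op */
        free(journal);
        return NULL;
}

int main(void)
{
        struct journal *j = journal_init_sketch();
        if (j) {
                free(j->j_wbuf);
                free(j->j_sb_buffer);
                free(j);
        }
        return 0;
}
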
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbdc..da6cd9bdaabc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
 74 * Finally, the replay code also uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -67,6 +86,7 @@
67#include <linux/slab.h> 86#include <linux/slab.h>
68#include <linux/list.h> 87#include <linux/list.h>
69#include <linux/init.h> 88#include <linux/init.h>
89#include <linux/bio.h>
70#endif 90#endif
71#include <linux/log2.h> 91#include <linux/log2.h>
72 92
@@ -99,8 +119,8 @@ struct jbd_revoke_table_s
99#ifdef __KERNEL__ 119#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *, 120static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *, 121 struct journal_head **, int *,
102 struct jbd_revoke_record_s *); 122 struct jbd_revoke_record_s *, int);
103static void flush_descriptor(journal_t *, struct journal_head *, int); 123static void flush_descriptor(journal_t *, struct journal_head *, int, int);
104#endif 124#endif
105 125
106/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
@@ -402,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
402 * the second time we would still have a pending revoke to cancel. So, 422 * the second time we would still have a pending revoke to cancel. So,
403 * do not trust the Revoked bit on buffers unless RevokeValid is also 423 * do not trust the Revoked bit on buffers unless RevokeValid is also
404 * set. 424 * set.
405 *
406 * The caller must have the journal locked.
407 */ 425 */
408int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 426int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
409{ 427{
@@ -481,12 +499,9 @@ void journal_switch_revoke_table(journal_t *journal)
481/* 499/*
482 * Write revoke records to the journal for all entries in the current 500 * Write revoke records to the journal for all entries in the current
483 * revoke hash, deleting the entries as we go. 501 * revoke hash, deleting the entries as we go.
484 *
485 * Called with the journal lock held.
486 */ 502 */
487
488void journal_write_revoke_records(journal_t *journal, 503void journal_write_revoke_records(journal_t *journal,
489 transaction_t *transaction) 504 transaction_t *transaction, int write_op)
490{ 505{
491 struct journal_head *descriptor; 506 struct journal_head *descriptor;
492 struct jbd_revoke_record_s *record; 507 struct jbd_revoke_record_s *record;
@@ -510,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal,
510 hash_list->next; 525 hash_list->next;
511 write_one_revoke_record(journal, transaction, 526 write_one_revoke_record(journal, transaction,
512 &descriptor, &offset, 527 &descriptor, &offset,
513 record); 528 record, write_op);
514 count++; 529 count++;
515 list_del(&record->hash); 530 list_del(&record->hash);
516 kmem_cache_free(revoke_record_cache, record); 531 kmem_cache_free(revoke_record_cache, record);
517 } 532 }
518 } 533 }
519 if (descriptor) 534 if (descriptor)
520 flush_descriptor(journal, descriptor, offset); 535 flush_descriptor(journal, descriptor, offset, write_op);
521 jbd_debug(1, "Wrote %d revoke records\n", count); 536 jbd_debug(1, "Wrote %d revoke records\n", count);
522} 537}
523 538
@@ -530,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
530 transaction_t *transaction, 545 transaction_t *transaction,
531 struct journal_head **descriptorp, 546 struct journal_head **descriptorp,
532 int *offsetp, 547 int *offsetp,
533 struct jbd_revoke_record_s *record) 548 struct jbd_revoke_record_s *record,
549 int write_op)
534{ 550{
535 struct journal_head *descriptor; 551 struct journal_head *descriptor;
536 int offset; 552 int offset;
@@ -549,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
549 /* Make sure we have a descriptor with space left for the record */ 565 /* Make sure we have a descriptor with space left for the record */
550 if (descriptor) { 566 if (descriptor) {
551 if (offset == journal->j_blocksize) { 567 if (offset == journal->j_blocksize) {
552 flush_descriptor(journal, descriptor, offset); 568 flush_descriptor(journal, descriptor, offset, write_op);
553 descriptor = NULL; 569 descriptor = NULL;
554 } 570 }
555 } 571 }
@@ -586,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal,
586 602
587static void flush_descriptor(journal_t *journal, 603static void flush_descriptor(journal_t *journal,
588 struct journal_head *descriptor, 604 struct journal_head *descriptor,
589 int offset) 605 int offset, int write_op)
590{ 606{
591 journal_revoke_header_t *header; 607 journal_revoke_header_t *header;
592 struct buffer_head *bh = jh2bh(descriptor); 608 struct buffer_head *bh = jh2bh(descriptor);
@@ -601,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
601 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
602 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
603 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
604 ll_rw_block(SWRITE, 1, &bh); 620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
605} 621}
606#endif 622#endif
607 623
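
The locking-rules comment added above is really an ownership statement: the table pointed to by journal->j_revoke is shared and needs j_revoke_lock, while the table switched out for the committing transaction belongs to kjournald alone. A sketch of that switch, with the journal and table structures reduced to the fields involved (buckets elided, and the real j_revoke_table entries are pointers):

struct revoke_table { int buckets; /* real code: hash list heads */ };

struct journal {
        struct revoke_table *j_revoke;       /* running transaction's table */
        struct revoke_table  j_revoke_table[2];
};

/* journal_switch_revoke_table() sketch: only kjournald calls this, so
 * the table it hands back is private to the commit thread and can be
 * walked and emptied without taking j_revoke_lock. */
static struct revoke_table *switch_revoke_table(struct journal *journal)
{
        struct revoke_table *committing = journal->j_revoke;

        if (committing == &journal->j_revoke_table[0])
                journal->j_revoke = &journal->j_revoke_table[1];
        else
                journal->j_revoke = &journal->j_revoke_table[0];
        return committing;
}

int main(void)
{
        struct journal j = { .j_revoke = &j.j_revoke_table[0] };

        return switch_revoke_table(&j) == &j.j_revoke_table[0] ? 0 : 1;
}
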
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (handle->h_sync)
1444 transaction->t_synchronous_commit = 1;
1443 current->journal_info = NULL; 1445 current->journal_info = NULL;
1444 spin_lock(&journal->j_state_lock); 1446 spin_lock(&journal->j_state_lock);
1445 spin_lock(&transaction->t_handle_lock); 1447 spin_lock(&transaction->t_handle_lock);
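
This is the producer side of the write_op plumbing: journal_stop() marks the transaction as soon as any synchronous handle completes. A two-struct sketch of the propagation, with the types trimmed to the fields involved:

#include <assert.h>

struct handle      { int h_sync; };
struct transaction { int t_synchronous_commit; };

/* One fsync-driven handle is enough: the flag is sticky, so the whole
 * compound transaction commits with the synchronous write ops above. */
static void journal_stop_sketch(struct handle *h, struct transaction *t)
{
        if (h->h_sync)
                t->t_synchronous_commit = 1;
}

int main(void)
{
        struct transaction t = { 0 };
        struct handle plain = { .h_sync = 0 }, syncing = { .h_sync = 1 };

        journal_stop_sketch(&plain, &t);
        journal_stop_sketch(&syncing, &t);
        assert(t.t_synchronous_commit == 1);
        return 0;
}
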
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..0b7d3b8226fd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
138 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
139 barrier_done = 1; 139 barrier_done = 1;
140 } 140 }
141 ret = submit_bh(WRITE_SYNC, bh); 141 ret = submit_bh(WRITE_SYNC_PLUG, bh);
142 if (barrier_done) 142 if (barrier_done)
143 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
144 144
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
159 lock_buffer(bh); 159 lock_buffer(bh);
160 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
161 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
162 ret = submit_bh(WRITE_SYNC, bh); 162 ret = submit_bh(WRITE_SYNC_PLUG, bh);
163 } 163 }
164 *cbh = bh; 164 *cbh = bh;
165 return ret; 165 return ret;
@@ -190,7 +190,7 @@ retry:
190 set_buffer_uptodate(bh); 190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync; 191 bh->b_end_io = journal_end_buffer_io_sync;
192 192
193 ret = submit_bh(WRITE_SYNC, bh); 193 ret = submit_bh(WRITE_SYNC_PLUG, bh);
194 if (ret) { 194 if (ret) {
195 unlock_buffer(bh); 195 unlock_buffer(bh);
196 return ret; 196 return ret;
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
367 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
368 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
369 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
370 int write_op = WRITE;
370 371
371 /* 372 /*
372 * First job: lock down the current transaction and wait for 373 * First job: lock down the current transaction and wait for
@@ -401,6 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
401 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
402 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
403 404
405 /*
406 * Use plugged writes here, since we want to submit several before
407 * we unplug the device. We don't do explicit unplugging in here,
408 * instead we rely on sync_buffer() doing the unplug for us.
409 */
410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG;
404 stats.u.run.rs_wait = commit_transaction->t_max_wait; 412 stats.u.run.rs_wait = commit_transaction->t_max_wait;
405 stats.u.run.rs_locked = jiffies; 413 stats.u.run.rs_locked = jiffies;
406 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -498,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
498 if (err) 506 if (err)
499 jbd2_journal_abort(journal, err); 507 jbd2_journal_abort(journal, err);
500 508
501 jbd2_journal_write_revoke_records(journal, commit_transaction); 509 jbd2_journal_write_revoke_records(journal, commit_transaction,
510 write_op);
502 511
503 jbd_debug(3, "JBD: commit phase 2\n"); 512 jbd_debug(3, "JBD: commit phase 2\n");
504 513
@@ -680,7 +689,7 @@ start_journal_io:
680 clear_buffer_dirty(bh); 689 clear_buffer_dirty(bh);
681 set_buffer_uptodate(bh); 690 set_buffer_uptodate(bh);
682 bh->b_end_io = journal_end_buffer_io_sync; 691 bh->b_end_io = journal_end_buffer_io_sync;
683 submit_bh(WRITE, bh); 692 submit_bh(write_op, bh);
684 } 693 }
685 cond_resched(); 694 cond_resched();
686 stats.u.run.rs_blocks_logged += bufs; 695 stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..a360b06af2e3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
 74 * Finally, the replay code also uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -67,6 +86,7 @@
67#include <linux/slab.h> 86#include <linux/slab.h>
68#include <linux/list.h> 87#include <linux/list.h>
69#include <linux/init.h> 88#include <linux/init.h>
89#include <linux/bio.h>
70#endif 90#endif
71#include <linux/log2.h> 91#include <linux/log2.h>
72 92
@@ -99,8 +119,8 @@ struct jbd2_revoke_table_s
99#ifdef __KERNEL__ 119#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *, 120static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *, 121 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *); 122 struct jbd2_revoke_record_s *, int);
103static void flush_descriptor(journal_t *, struct journal_head *, int); 123static void flush_descriptor(journal_t *, struct journal_head *, int, int);
104#endif 124#endif
105 125
106/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
@@ -401,8 +421,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
401 * the second time we would still have a pending revoke to cancel. So, 421 * the second time we would still have a pending revoke to cancel. So,
402 * do not trust the Revoked bit on buffers unless RevokeValid is also 422 * do not trust the Revoked bit on buffers unless RevokeValid is also
403 * set. 423 * set.
404 *
405 * The caller must have the journal locked.
406 */ 424 */
407int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 425int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
408{ 426{
@@ -480,12 +498,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
480/* 498/*
481 * Write revoke records to the journal for all entries in the current 499 * Write revoke records to the journal for all entries in the current
482 * revoke hash, deleting the entries as we go. 500 * revoke hash, deleting the entries as we go.
483 *
484 * Called with the journal lock held.
485 */ 501 */
486
487void jbd2_journal_write_revoke_records(journal_t *journal, 502void jbd2_journal_write_revoke_records(journal_t *journal,
488 transaction_t *transaction) 503 transaction_t *transaction,
504 int write_op)
489{ 505{
490 struct journal_head *descriptor; 506 struct journal_head *descriptor;
491 struct jbd2_revoke_record_s *record; 507 struct jbd2_revoke_record_s *record;
@@ -509,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
509 hash_list->next; 525 hash_list->next;
510 write_one_revoke_record(journal, transaction, 526 write_one_revoke_record(journal, transaction,
511 &descriptor, &offset, 527 &descriptor, &offset,
512 record); 528 record, write_op);
513 count++; 529 count++;
514 list_del(&record->hash); 530 list_del(&record->hash);
515 kmem_cache_free(jbd2_revoke_record_cache, record); 531 kmem_cache_free(jbd2_revoke_record_cache, record);
516 } 532 }
517 } 533 }
518 if (descriptor) 534 if (descriptor)
519 flush_descriptor(journal, descriptor, offset); 535 flush_descriptor(journal, descriptor, offset, write_op);
520 jbd_debug(1, "Wrote %d revoke records\n", count); 536 jbd_debug(1, "Wrote %d revoke records\n", count);
521} 537}
522 538
@@ -529,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
529 transaction_t *transaction, 545 transaction_t *transaction,
530 struct journal_head **descriptorp, 546 struct journal_head **descriptorp,
531 int *offsetp, 547 int *offsetp,
532 struct jbd2_revoke_record_s *record) 548 struct jbd2_revoke_record_s *record,
549 int write_op)
533{ 550{
534 struct journal_head *descriptor; 551 struct journal_head *descriptor;
535 int offset; 552 int offset;
@@ -548,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
548 /* Make sure we have a descriptor with space left for the record */ 565 /* Make sure we have a descriptor with space left for the record */
549 if (descriptor) { 566 if (descriptor) {
550 if (offset == journal->j_blocksize) { 567 if (offset == journal->j_blocksize) {
551 flush_descriptor(journal, descriptor, offset); 568 flush_descriptor(journal, descriptor, offset, write_op);
552 descriptor = NULL; 569 descriptor = NULL;
553 } 570 }
554 } 571 }
@@ -593,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal,
593 610
594static void flush_descriptor(journal_t *journal, 611static void flush_descriptor(journal_t *journal,
595 struct journal_head *descriptor, 612 struct journal_head *descriptor,
596 int offset) 613 int offset, int write_op)
597{ 614{
598 jbd2_journal_revoke_header_t *header; 615 jbd2_journal_revoke_header_t *header;
599 struct buffer_head *bh = jh2bh(descriptor); 616 struct buffer_head *bh = jh2bh(descriptor);
@@ -608,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
608 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
609 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
610 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
611 ll_rw_block(SWRITE, 1, &bh); 628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
612} 629}
613#endif 630#endif
614 631
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598e..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 if (handle->h_sync)
1319 transaction->t_synchronous_commit = 1;
1318 current->journal_info = NULL; 1320 current->journal_info = NULL;
1319 spin_lock(&journal->j_state_lock); 1321 spin_lock(&journal->j_state_lock);
1320 spin_lock(&transaction->t_handle_lock); 1322 spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
38 size_t s; 38 size_t s;
39 39
40 size -= sizeof(struct jffs2_acl_header); 40 size -= sizeof(struct jffs2_acl_header);
41 s = size - 4 * sizeof(struct jffs2_acl_entry_short); 41 if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
42 if (s < 0) {
43 if (size % sizeof(struct jffs2_acl_entry_short)) 42 if (size % sizeof(struct jffs2_acl_entry_short))
44 return -1; 43 return -1;
45 return size / sizeof(struct jffs2_acl_entry_short); 44 return size / sizeof(struct jffs2_acl_entry_short);
46 } else { 45 } else {
46 s = size - 4 * sizeof(struct jffs2_acl_entry_short);
47 if (s % sizeof(struct jffs2_acl_entry)) 47 if (s % sizeof(struct jffs2_acl_entry))
48 return -1; 48 return -1;
49 return s / sizeof(struct jffs2_acl_entry) + 4; 49 return s / sizeof(struct jffs2_acl_entry) + 4;
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
336 return PTR_ERR(acl); 336 return PTR_ERR(acl);
337 337
338 if (!acl) { 338 if (!acl) {
339 *i_mode &= ~current->fs->umask; 339 *i_mode &= ~current_umask();
340 } else { 340 } else {
341 if (S_ISDIR(*i_mode)) 341 if (S_ISDIR(*i_mode))
342 jffs2_iset_acl(inode, &f->i_acl_default, acl); 342 jffs2_iset_acl(inode, &f->i_acl_default, acl);
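
The jffs2_acl_count() hunk fixes an unsigned-underflow bug: s is a size_t, so the old "s = size - 4 * sizeof(...); if (s < 0)" test could never fire; the subtraction wraps to a huge value instead. The reordered code compares before subtracting. A standalone model with made-up entry sizes:

#include <stdio.h>
#include <stddef.h>

/* Compare first, subtract second: the only safe order for unsigned
 * arithmetic. short_sz/long_sz stand in for the two jffs2 entry sizes. */
static int acl_entry_count(size_t size, size_t short_sz, size_t long_sz)
{
        if (size < 4 * short_sz) {
                if (size % short_sz)
                        return -1;
                return (int)(size / short_sz);
        }
        size -= 4 * short_sz;
        if (size % long_sz)
                return -1;
        return (int)(size / long_sz) + 4;
}

int main(void)
{
        printf("%d\n", acl_entry_count(8, 4, 8));   /* 2 short entries */
        printf("%d\n", acl_entry_count(40, 4, 8));  /* 4 short + 3 long = 7 */
        return 0;
}
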
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index c32b4a1ad6cf..a0244740b75a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -480,13 +480,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
480 return; 480 return;
481 481
482filebad: 482filebad:
483 mutex_lock(&c->erase_free_sem);
484 spin_lock(&c->erase_completion_lock);
485 /* Stick it on a list (any list) so erase_failed can take it
486 right off again. Silly, but shouldn't happen often. */
487 list_move(&jeb->list, &c->erasing_list);
488 spin_unlock(&c->erase_completion_lock);
489 mutex_unlock(&c->erase_free_sem);
490 jffs2_erase_failed(c, jeb, bad_offset); 483 jffs2_erase_failed(c, jeb, bad_offset);
491 return; 484 return;
492 485
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) 284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
285{ 285{
286 struct jffs2_xattr_datum *xd; 286 struct jffs2_xattr_datum *xd;
287 xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); 287 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
288 dbg_memalloc("%p\n", xd); 288 dbg_memalloc("%p\n", xd);
289 289
290 memset(xd, 0, sizeof(struct jffs2_xattr_datum));
291 xd->class = RAWNODE_CLASS_XATTR_DATUM; 290 xd->class = RAWNODE_CLASS_XATTR_DATUM;
292 xd->node = (void *)xd; 291 xd->node = (void *)xd;
293 INIT_LIST_HEAD(&xd->xindex); 292 INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
303struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) 302struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
304{ 303{
305 struct jffs2_xattr_ref *ref; 304 struct jffs2_xattr_ref *ref;
306 ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); 305 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
307 dbg_memalloc("%p\n", ref); 306 dbg_memalloc("%p\n", ref);
308 307
309 memset(ref, 0, sizeof(struct jffs2_xattr_ref));
310 ref->class = RAWNODE_CLASS_XATTR_REF; 308 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 309 ref->node = (void *)ref;
312 return ref; 310 return ref;
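
Both allocators above collapse a kmem_cache_alloc() + memset() pair into kmem_cache_zalloc(), which hands back pre-zeroed memory in one call. The userspace analogue is calloc():

#include <stdlib.h>

struct xattr_datum { int class; void *node; };

/* calloc() plays the role of kmem_cache_zalloc(): the returned object
 * is already zero-filled, so no separate memset() pass is needed. */
static struct xattr_datum *alloc_xattr_datum(void)
{
        struct xattr_datum *xd = calloc(1, sizeof(*xd));

        if (!xd)
                return NULL;
        xd->class = 1;   /* illustrative; real code uses RAWNODE_CLASS_XATTR_DATUM */
        xd->node = xd;
        return xd;
}

int main(void)
{
        free(alloc_xattr_datum());
        return 0;
}
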
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e82..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
182cleanup: 182cleanup:
183 posix_acl_release(acl); 183 posix_acl_release(acl);
184 } else 184 } else
185 inode->i_mode &= ~current->fs->umask; 185 inode->i_mode &= ~current_umask();
186 186
187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
188 inode->i_mode; 188 inode->i_mode;
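
This hunk, like the jffs2 one above and the namei ones below, swaps the open-coded current->fs->umask dereference for the current_umask() accessor; the masking arithmetic itself is unchanged. For reference, the operation in isolation:

#include <stdio.h>

int main(void)
{
        unsigned int mode = 0666;   /* requested permissions */
        unsigned int umask = 022;   /* what current_umask() would return */

        /* Every bit set in the mask is cleared from the mode:
         * 0666 & ~0022 == 0644, stripping group/other write. */
        printf("%04o\n", mode & ~umask);
        return 0;
}
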
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..80046ddf5063 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -246,8 +246,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
246 return 0; 246 return 0;
247 247
248Enomem: 248Enomem:
249 up_write(&s->s_umount); 249 deactivate_locked_super(s);
250 deactivate_super(s);
251 return -ENOMEM; 250 return -ENOMEM;
252} 251}
253 252
@@ -575,6 +574,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
575 * possibly a read which collects the result - which is stored in a 574 * possibly a read which collects the result - which is stored in a
576 * file-local buffer. 575 * file-local buffer.
577 */ 576 */
577
578void simple_transaction_set(struct file *file, size_t n)
579{
580 struct simple_transaction_argresp *ar = file->private_data;
581
582 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
583
584 /*
585 * The barrier ensures that ar->size will really remain zero until
586 * ar->data is ready for reading.
587 */
588 smp_mb();
589 ar->size = n;
590}
591
578char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 592char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
579{ 593{
580 struct simple_transaction_argresp *ar; 594 struct simple_transaction_argresp *ar;
@@ -820,6 +834,7 @@ EXPORT_SYMBOL(simple_sync_file);
820EXPORT_SYMBOL(simple_unlink); 834EXPORT_SYMBOL(simple_unlink);
821EXPORT_SYMBOL(simple_read_from_buffer); 835EXPORT_SYMBOL(simple_read_from_buffer);
822EXPORT_SYMBOL(memory_read_from_buffer); 836EXPORT_SYMBOL(memory_read_from_buffer);
837EXPORT_SYMBOL(simple_transaction_set);
823EXPORT_SYMBOL(simple_transaction_get); 838EXPORT_SYMBOL(simple_transaction_get);
824EXPORT_SYMBOL(simple_transaction_read); 839EXPORT_SYMBOL(simple_transaction_read);
825EXPORT_SYMBOL(simple_transaction_release); 840EXPORT_SYMBOL(simple_transaction_release);
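
simple_transaction_set() is a publish pattern: the response bytes are written into the shared buffer first, and only then does the store to ar->size make them visible, with the barrier forbidding reordering of the two. A C11-atomics model of the same ordering (buffer size arbitrary):

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

struct argresp {
        _Atomic size_t size;
        char data[64];
};

/* Fill the payload, then publish its length with release semantics --
 * the role smp_mb() plays in the kernel version. A reader that loads
 * size with acquire semantics and sees n != 0 also sees the data. */
static void transaction_set(struct argresp *ar, const char *msg, size_t n)
{
        memcpy(ar->data, msg, n);
        atomic_store_explicit(&ar->size, n, memory_order_release);
}

int main(void)
{
        struct argresp ar = { 0 };

        transaction_set(&ar, "ok", 2);
        printf("%zu\n", atomic_load_explicit(&ar.size, memory_order_acquire));
        return 0;
}
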
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c1..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
161 * to come from AF_INET6 remotes. The address of AF_INET remotes are
162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
191/* 142/*
192 * The server lockd has called us back to tell us the lock was granted 143 * The server lockd has called us back to tell us the lock was granted
193 */ 144 */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
215 */ 166 */
216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
217 continue; 168 continue;
218 if (!nlmclnt_cmp_addr(block->b_host, addr)) 169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
219 continue; 170 continue;
220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
221 continue; 172 continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18 18
19#include <asm/unaligned.h>
20
19#define NLMDBG_FACILITY NLMDBG_MONITOR 21#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024 22#define NSM_PROGRAM 100024
21#define NSM_VERSION 1 23#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
274{ 276{
275 u64 *p = (u64 *)&nsm->sm_priv.data; 277 u64 *p = (u64 *)&nsm->sm_priv.data;
276 struct timespec ts; 278 struct timespec ts;
279 s64 ns;
277 280
278 ktime_get_ts(&ts); 281 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts); 282 ns = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm; 283 put_unaligned(ns, p);
284 put_unaligned((unsigned long)nsm, p + 1);
281} 285}
282 286
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, 287static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
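
The mon.c hunk exists because sm_priv.data is a plain byte array with no alignment guarantee: storing a u64 through a cast pointer is undefined behaviour on strict-alignment architectures, and put_unaligned() emits a byte-safe store instead. The portable userspace equivalent is memcpy():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* memcpy() never assumes the destination is 8-byte aligned, which is
 * exactly the guarantee put_unaligned() provides in the kernel. */
static void put_unaligned_u64(uint64_t v, unsigned char *p)
{
        memcpy(p, &v, sizeof(v));
}

int main(void)
{
        unsigned char priv[16];    /* models nsm->sm_priv.data */
        uint64_t ns = 123456789;   /* the timestamp word */

        put_unaligned_u64(ns, priv);
        put_unaligned_u64((uintptr_t)priv, priv + 8);   /* the cookie word */
        printf("%02x %02x\n", priv[0], priv[8]);
        return 0;
}
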
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..1a54ae14a192 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
53unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
54 54
55/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
67 * These can be set at insmod time (useful for NFS as root filesystem), 56 * These can be set at insmod time (useful for NFS as root filesystem),
68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 57 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
69 */ 58 */
@@ -115,6 +104,16 @@ static void set_grace_period(void)
115 schedule_delayed_work(&grace_period_end, grace_period); 104 schedule_delayed_work(&grace_period_end, grace_period);
116} 105}
117 106
107static void restart_grace(void)
108{
109 if (nlmsvc_ops) {
110 cancel_delayed_work_sync(&grace_period_end);
111 locks_end_grace(&lockd_manager);
112 nlmsvc_invalidate_all();
113 set_grace_period();
114 }
115}
116
118/* 117/*
119 * This is the lockd kernel thread 118 * This is the lockd kernel thread
120 */ 119 */
@@ -160,10 +159,7 @@ lockd(void *vrqstp)
160 159
161 if (signalled()) { 160 if (signalled()) {
162 flush_signals(current); 161 flush_signals(current);
163 if (nlmsvc_ops) { 162 restart_grace();
164 nlmsvc_invalidate_all();
165 set_grace_period();
166 }
167 continue; 163 continue;
168 } 164 }
169 165
@@ -204,19 +200,30 @@ lockd(void *vrqstp)
204 return 0; 200 return 0;
205} 201}
206 202
207static int create_lockd_listener(struct svc_serv *serv, char *name, 203static int create_lockd_listener(struct svc_serv *serv, const char *name,
208 unsigned short port) 204 const int family, const unsigned short port)
209{ 205{
210 struct svc_xprt *xprt; 206 struct svc_xprt *xprt;
211 207
212 xprt = svc_find_xprt(serv, name, 0, 0); 208 xprt = svc_find_xprt(serv, name, family, 0);
213 if (xprt == NULL) 209 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); 210 return svc_create_xprt(serv, name, family, port,
215 211 SVC_SOCK_DEFAULTS);
216 svc_xprt_put(xprt); 212 svc_xprt_put(xprt);
217 return 0; 213 return 0;
218} 214}
219 215
216static int create_lockd_family(struct svc_serv *serv, const int family)
217{
218 int err;
219
220 err = create_lockd_listener(serv, "udp", family, nlm_udpport);
221 if (err < 0)
222 return err;
223
224 return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
225}
226
220/* 227/*
221 * Ensure there are active UDP and TCP listeners for lockd. 228 * Ensure there are active UDP and TCP listeners for lockd.
222 * 229 *
@@ -232,13 +239,15 @@ static int make_socks(struct svc_serv *serv)
232 static int warned; 239 static int warned;
233 int err; 240 int err;
234 241
235 err = create_lockd_listener(serv, "udp", nlm_udpport); 242 err = create_lockd_family(serv, PF_INET);
236 if (err < 0) 243 if (err < 0)
237 goto out_err; 244 goto out_err;
238 245
239 err = create_lockd_listener(serv, "tcp", nlm_tcpport); 246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240 if (err < 0) 247 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT)
241 goto out_err; 249 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
242 251
243 warned = 0; 252 warned = 0;
244 return 0; 253 return 0;
@@ -274,7 +283,7 @@ int lockd_up(void)
274 "lockd_up: no pid, %d users??\n", nlmsvc_users); 283 "lockd_up: no pid, %d users??\n", nlmsvc_users);
275 284
276 error = -ENOMEM; 285 error = -ENOMEM;
277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); 286 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
278 if (!serv) { 287 if (!serv) {
279 printk(KERN_WARNING "lockd_up: create service failed\n"); 288 printk(KERN_WARNING "lockd_up: create service failed\n");
280 goto out; 289 goto out;
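
make_socks() now builds listeners per address family instead of baking the family in at compile time: PF_INET is mandatory, PF_INET6 is attempted and quietly skipped when the kernel lacks IPv6. A self-contained sketch of that policy, with a hypothetical create_listener() standing in for svc_create_xprt() (here it always "fails" for IPv6 to exercise the tolerant path):

#include <errno.h>
#include <stdio.h>

#define SK_PF_INET   2    /* local stand-ins so the sketch compiles alone */
#define SK_PF_INET6 10

static int create_listener(const char *proto, int family, unsigned short port)
{
        (void)proto; (void)port;
        return family == SK_PF_INET6 ? -EAFNOSUPPORT : 0;
}

static int create_family(int family)
{
        int err = create_listener("udp", family, 0);

        if (err < 0)
                return err;
        return create_listener("tcp", family, 0);
}

int main(void)
{
        int err = create_family(SK_PF_INET);

        if (err < 0)
                return 1;
        /* IPv6 is opportunistic: EAFNOSUPPORT is tolerated, while any
         * other error is still a fatal setup failure. */
        err = create_family(SK_PF_INET6);
        if (err < 0 && err != -EAFNOSUPPORT)
                return 1;
        puts("listeners ready");
        return 0;
}
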
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
426 ret = nlm_granted; 426 ret = nlm_granted;
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 /*
430 * If this is a blocking request for an
431 * already pending lock request then we need
432 * to put it back on lockd's block list
433 */
434 if (wait)
435 break;
429 ret = nlm_lck_denied; 436 ret = nlm_lck_denied;
430 break; 437 goto out;
431 case FILE_LOCK_DEFERRED: 438 case FILE_LOCK_DEFERRED:
432 if (wait) 439 if (wait)
433 break; 440 break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 450 goto out;
444 } 451 }
445 452
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
450 ret = nlm_lck_blocked; 453 ret = nlm_lck_blocked;
451 454
452 /* Append to list of blocked */ 455 /* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128b..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
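
minix_statfs() now also fills in f_fsid, derived from the backing block device: the 64-bit huge_encode_dev() result is simply split across the fsid's two 32-bit words. The packing in isolation (the id value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t id = 0x0000000800000003ULL;   /* made-up encoded dev number */
        uint32_t val[2];

        val[0] = (uint32_t)id;           /* low 32 bits  */
        val[1] = (uint32_t)(id >> 32);   /* high 32 bits */
        printf("f_fsid = { 0x%08x, 0x%08x }\n", val[0], val[1]);
        return 0;
}
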
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..967c3db92724 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/fcntl.h> 33#include <linux/fcntl.h>
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1129,8 +1130,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1129 * @nd: pointer to nameidata 1130 * @nd: pointer to nameidata
1130 * @open_flags: open intent flags 1131 * @open_flags: open intent flags
1131 */ 1132 */
1132int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, 1133static int path_lookup_open(int dfd, const char *name,
1133 struct nameidata *nd, int open_flags) 1134 unsigned int lookup_flags, struct nameidata *nd, int open_flags)
1134{ 1135{
1135 struct file *filp = get_empty_filp(); 1136 struct file *filp = get_empty_filp();
1136 int err; 1137 int err;
@@ -1247,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1247 int err; 1248 int err;
1248 struct qstr this; 1249 struct qstr this;
1249 1250
1251 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1252
1250 err = __lookup_one_len(name, &this, base, len); 1253 err = __lookup_one_len(name, &this, base, len);
1251 if (err) 1254 if (err)
1252 return ERR_PTR(err); 1255 return ERR_PTR(err);
@@ -1578,7 +1581,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1578 struct dentry *dir = nd->path.dentry; 1581 struct dentry *dir = nd->path.dentry;
1579 1582
1580 if (!IS_POSIXACL(dir->d_inode)) 1583 if (!IS_POSIXACL(dir->d_inode))
1581 mode &= ~current->fs->umask; 1584 mode &= ~current_umask();
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1585 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error) 1586 if (error)
1584 goto out_unlock; 1587 goto out_unlock;
@@ -1634,18 +1637,19 @@ static int open_will_write_to_fs(int flag, struct inode *inode)
1634 * open_to_namei_flags() for more details. 1637 * open_to_namei_flags() for more details.
1635 */ 1638 */
1636struct file *do_filp_open(int dfd, const char *pathname, 1639struct file *do_filp_open(int dfd, const char *pathname,
1637 int open_flag, int mode) 1640 int open_flag, int mode, int acc_mode)
1638{ 1641{
1639 struct file *filp; 1642 struct file *filp;
1640 struct nameidata nd; 1643 struct nameidata nd;
1641 int acc_mode, error; 1644 int error;
1642 struct path path; 1645 struct path path;
1643 struct dentry *dir; 1646 struct dentry *dir;
1644 int count = 0; 1647 int count = 0;
1645 int will_write; 1648 int will_write;
1646 int flag = open_to_namei_flags(open_flag); 1649 int flag = open_to_namei_flags(open_flag);
1647 1650
1648 acc_mode = MAY_OPEN | ACC_MODE(flag); 1651 if (!acc_mode)
1652 acc_mode = MAY_OPEN | ACC_MODE(flag);
1649 1653
1650 /* O_TRUNC implies we need access checks for write permissions */ 1654 /* O_TRUNC implies we need access checks for write permissions */
1651 if (flag & O_TRUNC) 1655 if (flag & O_TRUNC)
@@ -1866,7 +1870,7 @@ do_link:
1866 */ 1870 */
1867struct file *filp_open(const char *filename, int flags, int mode) 1871struct file *filp_open(const char *filename, int flags, int mode)
1868{ 1872{
1869 return do_filp_open(AT_FDCWD, filename, flags, mode); 1873 return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
1870} 1874}
1871EXPORT_SYMBOL(filp_open); 1875EXPORT_SYMBOL(filp_open);
1872 1876
@@ -1989,7 +1993,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1989 goto out_unlock; 1993 goto out_unlock;
1990 } 1994 }
1991 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1995 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1992 mode &= ~current->fs->umask; 1996 mode &= ~current_umask();
1993 error = may_mknod(mode); 1997 error = may_mknod(mode);
1994 if (error) 1998 if (error)
1995 goto out_dput; 1999 goto out_dput;
@@ -2067,7 +2071,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2067 goto out_unlock; 2071 goto out_unlock;
2068 2072
2069 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2073 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2070 mode &= ~current->fs->umask; 2074 mode &= ~current_umask();
2071 error = mnt_want_write(nd.path.mnt); 2075 error = mnt_want_write(nd.path.mnt);
2072 if (error) 2076 if (error)
2073 goto out_dput; 2077 goto out_dput;
@@ -2897,10 +2901,3 @@ EXPORT_SYMBOL(vfs_symlink);
2897EXPORT_SYMBOL(vfs_unlink); 2901EXPORT_SYMBOL(vfs_unlink);
2898EXPORT_SYMBOL(dentry_unhash); 2902EXPORT_SYMBOL(dentry_unhash);
2899EXPORT_SYMBOL(generic_readlink); 2903EXPORT_SYMBOL(generic_readlink);
2900
2901/* to be mentioned only in INIT_TASK */
2902struct fs_struct init_fs = {
2903 .count = ATOMIC_INIT(1),
2904 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2905 .umask = 0022,
2906};
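
do_filp_open() gains an acc_mode argument where 0 means "derive the access mode from the open flags", so existing callers such as filp_open() pass 0 and behave as before, while new callers can impose a mode of their own. The idiom in isolation (ACC_MODE copied from the file; the MAY_OPEN value is an assumption for the sketch, and flag is the namei-adjusted value where O_RDONLY arrives as 1):

#include <stdio.h>

#define O_ACCMODE 0003
#define MAY_OPEN  0x20    /* assumed value for this sketch */
#define ACC_MODE(x) ("\000\004\002\006"[(x) & O_ACCMODE])

/* Zero acts as "no caller preference", exactly as in the reworked
 * do_filp_open() prologue. */
static int effective_acc_mode(int flag, int acc_mode)
{
        if (!acc_mode)
                acc_mode = MAY_OPEN | ACC_MODE(flag);
        return acc_mode;
}

int main(void)
{
        printf("derived (read-only)  -> %#x\n", effective_acc_mode(1, 0));
        printf("caller-supplied mode -> %#x\n", effective_acc_mode(1, 0x4));
        return 0;
}
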
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e96027..134d494158d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -694,12 +695,16 @@ static inline void mangle(struct seq_file *m, const char *s)
694 */ 695 */
695int generic_show_options(struct seq_file *m, struct vfsmount *mnt) 696int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
696{ 697{
697 const char *options = mnt->mnt_sb->s_options; 698 const char *options;
699
700 rcu_read_lock();
701 options = rcu_dereference(mnt->mnt_sb->s_options);
698 702
699 if (options != NULL && options[0]) { 703 if (options != NULL && options[0]) {
700 seq_putc(m, ','); 704 seq_putc(m, ',');
701 mangle(m, options); 705 mangle(m, options);
702 } 706 }
707 rcu_read_unlock();
703 708
704 return 0; 709 return 0;
705} 710}
@@ -720,11 +725,22 @@ EXPORT_SYMBOL(generic_show_options);
720 */ 725 */
721void save_mount_options(struct super_block *sb, char *options) 726void save_mount_options(struct super_block *sb, char *options)
722{ 727{
723 kfree(sb->s_options); 728 BUG_ON(sb->s_options);
724 sb->s_options = kstrdup(options, GFP_KERNEL); 729 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
725} 730}
726EXPORT_SYMBOL(save_mount_options); 731EXPORT_SYMBOL(save_mount_options);
727 732
733void replace_mount_options(struct super_block *sb, char *options)
734{
735 char *old = sb->s_options;
736 rcu_assign_pointer(sb->s_options, options);
737 if (old) {
738 synchronize_rcu();
739 kfree(old);
740 }
741}
742EXPORT_SYMBOL(replace_mount_options);
743
728#ifdef CONFIG_PROC_FS 744#ifdef CONFIG_PROC_FS
729/* iterator */ 745/* iterator */
730static void *m_start(struct seq_file *m, loff_t *pos) 746static void *m_start(struct seq_file *m, loff_t *pos)
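
save_mount_options() and the new replace_mount_options() pair with the RCU read side in generic_show_options() above: readers dereference sb->s_options locklessly, so an update must publish the new string first and free the old one only after a grace period. The shape of the update, modeled with C11 atomics and a stubbed grace period:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static _Atomic(char *) s_options;   /* read locklessly, like sb->s_options */

/* Stub standing in for synchronize_rcu(): in the kernel this blocks
 * until every reader that might still hold the old pointer is done. */
static void grace_period(void)
{
}

static void replace_options(char *new_opts)
{
        char *old = atomic_exchange_explicit(&s_options, new_opts,
                                             memory_order_release);

        if (old) {
                grace_period();   /* only now is freeing the old copy safe */
                free(old);
        }
}

int main(void)
{
        replace_options(strdup("rw,relatime"));
        replace_options(strdup("ro"));
        printf("%s\n", atomic_load_explicit(&s_options, memory_order_acquire));
        free(atomic_load_explicit(&s_options, memory_order_acquire));
        return 0;
}
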
@@ -1072,9 +1088,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1072 */ 1088 */
1073 1089
1074 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1090 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1075 lock_kernel();
1076 sb->s_op->umount_begin(sb); 1091 sb->s_op->umount_begin(sb);
1077 unlock_kernel();
1078 } 1092 }
1079 1093
1080 /* 1094 /*
@@ -1376,7 +1390,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1376 if (parent_path) { 1390 if (parent_path) {
1377 detach_mnt(source_mnt, parent_path); 1391 detach_mnt(source_mnt, parent_path);
1378 attach_mnt(source_mnt, path); 1392 attach_mnt(source_mnt, path);
1379 touch_mnt_namespace(current->nsproxy->mnt_ns); 1393 touch_mnt_namespace(parent_path->mnt->mnt_ns);
1380 } else { 1394 } else {
1381 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 1395 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
1382 commit_tree(source_mnt); 1396 commit_tree(source_mnt);
@@ -1919,8 +1933,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1919 if (data_page) 1933 if (data_page)
1920 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1934 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1921 1935
1922 /* Default to relatime */ 1936 /* Default to relatime unless overridden */
1923 mnt_flags |= MNT_RELATIME; 1937 if (!(flags & MS_NOATIME))
1938 mnt_flags |= MNT_RELATIME;
1924 1939
1925 /* Separate the per-mountpoint flags */ 1940 /* Separate the per-mountpoint flags */
1926 if (flags & MS_NOSUID) 1941 if (flags & MS_NOSUID)
@@ -2093,66 +2108,6 @@ out1:
2093} 2108}
2094 2109
2095/* 2110/*
2096 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2097 * It can block. Requires the big lock held.
2098 */
2099void set_fs_root(struct fs_struct *fs, struct path *path)
2100{
2101 struct path old_root;
2102
2103 write_lock(&fs->lock);
2104 old_root = fs->root;
2105 fs->root = *path;
2106 path_get(path);
2107 write_unlock(&fs->lock);
2108 if (old_root.dentry)
2109 path_put(&old_root);
2110}
2111
2112/*
2113 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2114 * It can block. Requires the big lock held.
2115 */
2116void set_fs_pwd(struct fs_struct *fs, struct path *path)
2117{
2118 struct path old_pwd;
2119
2120 write_lock(&fs->lock);
2121 old_pwd = fs->pwd;
2122 fs->pwd = *path;
2123 path_get(path);
2124 write_unlock(&fs->lock);
2125
2126 if (old_pwd.dentry)
2127 path_put(&old_pwd);
2128}
2129
2130static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2131{
2132 struct task_struct *g, *p;
2133 struct fs_struct *fs;
2134
2135 read_lock(&tasklist_lock);
2136 do_each_thread(g, p) {
2137 task_lock(p);
2138 fs = p->fs;
2139 if (fs) {
2140 atomic_inc(&fs->count);
2141 task_unlock(p);
2142 if (fs->root.dentry == old_root->dentry
2143 && fs->root.mnt == old_root->mnt)
2144 set_fs_root(fs, new_root);
2145 if (fs->pwd.dentry == old_root->dentry
2146 && fs->pwd.mnt == old_root->mnt)
2147 set_fs_pwd(fs, new_root);
2148 put_fs_struct(fs);
2149 } else
2150 task_unlock(p);
2151 } while_each_thread(g, p);
2152 read_unlock(&tasklist_lock);
2153}
2154
2155/*
2156 * pivot_root Semantics: 2111 * pivot_root Semantics:
2157 * Moves the root file system of the current process to the directory put_old, 2112 * Moves the root file system of the current process to the directory put_old,
2158 * makes new_root as the new root file system of the current process, and sets 2113 * makes new_root as the new root file system of the current process, and sets
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f50a9c..fa038df63ac8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) 660 if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
661 return -ENOMEM; 661 return -ENOMEM;
662 if (user.object_name_len) { 662 if (user.object_name_len) {
663 newname = kmalloc(user.object_name_len, GFP_USER); 663 newname = memdup_user(user.object_name,
664 if (!newname) 664 user.object_name_len);
665 return -ENOMEM; 665 if (IS_ERR(newname))
666 if (copy_from_user(newname, user.object_name, user.object_name_len)) { 666 return PTR_ERR(newname);
667 kfree(newname);
668 return -EFAULT;
669 }
670 } else { 667 } else {
671 newname = NULL; 668 newname = NULL;
672 } 669 }
@@ -760,13 +757,9 @@ outrel:
760 if (user.len > NCP_PRIVATE_DATA_MAX_LEN) 757 if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
761 return -ENOMEM; 758 return -ENOMEM;
762 if (user.len) { 759 if (user.len) {
763 new = kmalloc(user.len, GFP_USER); 760 new = memdup_user(user.data, user.len);
764 if (!new) 761 if (IS_ERR(new))
765 return -ENOMEM; 762 return PTR_ERR(new);
766 if (copy_from_user(new, user.data, user.len)) {
767 kfree(new);
768 return -EFAULT;
769 }
770 } else { 763 } else {
771 new = NULL; 764 new = NULL;
772 } 765 }
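
Both ncpfs hunks replace the kmalloc() + copy_from_user() + hand-rolled error unwind with memdup_user(), which allocates, copies and reports failure through an ERR_PTR in one call. A userspace model, with memcpy() standing in for copy_from_user() and minimal ERR_PTR helpers so it compiles alone:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ERR_PTR(e) ((void *)(intptr_t)(e))
#define IS_ERR(p)  ((uintptr_t)(p) >= (uintptr_t)-4095)
#define PTR_ERR(p) ((long)(intptr_t)(p))

/* One call does allocate + copy + error reporting, so callers lose the
 * two separate failure branches the old code needed. */
static void *memdup_user_sketch(const void *uptr, size_t len)
{
        void *p = malloc(len);

        if (!p)
                return ERR_PTR(-ENOMEM);
        memcpy(p, uptr, len);   /* stands in for copy_from_user() */
        return p;
}

int main(void)
{
        const char name[] = "object";
        char *copy = memdup_user_sketch(name, sizeof(name));

        if (IS_ERR(copy))
                return (int)-PTR_ERR(copy);
        printf("%s\n", copy);
        free(copy);
        return 0;
}
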
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
94 the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
38 38
39unsigned int nfs_callback_set_tcpport; 39unsigned int nfs_callback_set_tcpport;
40unsigned short nfs_callback_tcpport; 40unsigned short nfs_callback_tcpport;
41unsigned short nfs_callback_tcpport6;
41static const int nfs_set_port_min = 0; 42static const int nfs_set_port_min = 0;
42static const int nfs_set_port_max = 65535; 43static const int nfs_set_port_max = 65535;
43 44
44/*
45 * If the kernel has IPv6 support available, always listen for
46 * both AF_INET and AF_INET6 requests.
47 */
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49static const sa_family_t nfs_callback_family = AF_INET6;
50#else
51static const sa_family_t nfs_callback_family = AF_INET;
52#endif
53
54static int param_set_port(const char *val, struct kernel_param *kp) 45static int param_set_port(const char *val, struct kernel_param *kp)
55{ 46{
56 char *endp; 47 char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
116 mutex_lock(&nfs_callback_mutex); 107 mutex_lock(&nfs_callback_mutex);
117 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
118 goto out; 109 goto out;
119 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, 110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
120 nfs_callback_family, NULL);
121 ret = -ENOMEM; 111 ret = -ENOMEM;
122 if (!serv) 112 if (!serv)
123 goto out_err; 113 goto out_err;
124 114
125 ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, 115 ret = svc_create_xprt(serv, "tcp", PF_INET,
126 SVC_SOCK_ANONYMOUS); 116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
127 if (ret <= 0) 117 if (ret <= 0)
128 goto out_err; 118 goto out_err;
129 nfs_callback_tcpport = ret; 119 nfs_callback_tcpport = ret;
130 dprintk("NFS: Callback listener port = %u (af %u)\n", 120 dprintk("NFS: Callback listener port = %u (af %u)\n",
131 nfs_callback_tcpport, nfs_callback_family); 121 nfs_callback_tcpport, PF_INET);
122
123#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
124 ret = svc_create_xprt(serv, "tcp", PF_INET6,
125 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
126 if (ret > 0) {
127 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT)
131 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
132 133
133 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
134 if (IS_ERR(nfs_callback_info.rqst)) { 135 if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
72 72
73extern unsigned int nfs_callback_set_tcpport; 73extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 74extern unsigned short nfs_callback_tcpport;
75extern unsigned short nfs_callback_tcpport6;
75 76
76#endif /* __LINUX_FS_NFS_CALLBACK_H */ 77#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e7..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -224,38 +229,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
-	switch (sa->sa_family) {
-		default:
-			return NULL;
-		case AF_INET6:
-			return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-			break;
-		case AF_INET:
-			ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-					addr_mapped);
-			return addr_mapped;
-	}
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				     const struct sockaddr *sa2)
-{
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
-	return 0;
-}
-
 /*
  * Test if two ip6 socket addresses refer to the same socket by
  * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +240,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET6.
  */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-	if (!ipv6_addr_equal(&saddr1->sin6_addr,
-				&saddr1->sin6_addr))
-		return 0;
-	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-			saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    sin1->sin6_scope_id != sin2->sin6_scope_id)
 		return 0;
-	return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-				      const struct sockaddr_in *sa2)
-{
-	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
 
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-				     const struct sockaddr *sa2)
-{
-	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
-		return 0;
-	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-			(const struct sockaddr_in *)sa2);
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
 }
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
-				const struct sockaddr * sa2)
+#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -311,20 +267,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
 
-	if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
 		return 0;
-	return saddr1->sin_port == saddr2->sin_port;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
 }
 
 /*
  * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
  */
 static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
 			    const struct sockaddr *sa2)
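The rework above retires the v4-mapped comparison helpers: nfs_sockaddr_match_ipaddr() now rejects mismatched address families outright and dispatches to a per-family helper, with the port checked only by the nfs_sockaddr_cmp*() wrappers. A hedged illustration of the resulting semantics (addresses invented; the helpers are static to fs/nfs/client.c, so this could only live there):

	struct sockaddr_in a = { .sin_family = AF_INET };
	struct sockaddr_in6 b = { .sin6_family = AF_INET6 };

	a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1 */
	b.sin6_addr = in6addr_loopback;			/* ::1 */

	/* Different families now short-circuit to 0, whereas the old code
	 * would have matched ::ffff:127.0.0.1 against 127.0.0.1. */
	WARN_ON(nfs_sockaddr_match_ipaddr((struct sockaddr *)&a,
					  (struct sockaddr *)&b));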
@@ -772,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
+	server->options = data->options;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1160,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->options = data->options;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -1571,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 
 	/* display header on line 1 */
 	if (v == &nfs_volume_list) {
-		seq_puts(m, "NV SERVER   PORT DEV     FSID\n");
+		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
 		return 0;
 	}
 	/* display one transport per line on subsequent lines */
@@ -1585,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
-	seq_printf(m, "v%u %s %s %-7s %-17s\n",
+	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
 		   dev,
-		   fsid);
+		   fsid,
+		   nfs_server_fscache_state(server));
 
 	return 0;
 }
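With the new FSC column, nfs_volume_list_show() reports the per-server cache state from nfs_server_fscache_state() (defined in fs/nfs/fscache.h below). Illustrative /proc/fs/nfsfs/volumes output with the column in place (all values invented):

	NV SERVER   PORT DEV     FSID              FSC
	v3 c0a80001  801 0:18    1:0               no 
	v4 c0a80002  801 0:20    3:0               yes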
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db3..89f98e9a024b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		} else if (atomic_read(&new_dentry->d_count) > 1)
 			/* dentry still busy? */
 			goto out;
-	} else
-		nfs_drop_nlink(new_inode);
+	}
 
 go_ahead:
 	/*
@@ -1638,10 +1637,8 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode != NULL) {
+	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
-		d_delete(new_dentry);
-	}
 
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
@@ -1944,7 +1943,8 @@ int nfs_permission(struct inode *inode, int mask)
 	case S_IFREG:
 		/* NFSv4 has atomic_open... */
 		if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
-				&& (mask & MAY_OPEN))
+				&& (mask & MAY_OPEN)
+				&& !(mask & MAY_EXEC))
 			goto out;
 		break;
 	case S_IFDIR:
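Across the three nfs_rename() hunks, d_delete() on the target is gone and nfs_drop_nlink() moves from the pre-rename path into the !error branch, so the target's link count is only dropped once the server has actually replaced it. The resulting tail of nfs_rename(), paraphrased from the hunks above:

	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
					   new_dir, &new_dentry->d_name);
	...
	if (!error) {
		if (new_inode != NULL)
			nfs_drop_nlink(new_inode);	/* target really unlinked */
		d_move(old_dentry, new_dentry);
	}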
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d2..ec7e27d00bc6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
@@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
-#ifdef CONFIG_MMU
 	.mmap		= nfs_file_mmap,
-#else
-	.mmap		= generic_file_mmap,
-#endif
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
 	.release	= nfs_file_release,
@@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
-	/* Ensure that dirty pages are flushed out with the right creds */
-	if (filp->f_mode & FMODE_WRITE)
-		nfs_wb_all(dentry->d_inode);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
 }
@@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
-	int		status;
 
 	dprintk("NFS: flush(%s/%s)\n",
 			dentry->d_parent->d_name.name,
@@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
-	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_do_fsync(ctx, inode);
-	if (!status)
-		nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	return status;
+	/* Flush writes to the server and return any errors */
+	return nfs_do_fsync(ctx, inode);
 }
 
 static ssize_t
@@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dprintk("NFS: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_mapping(inode, file->f_mapping);
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *	 so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
 	if (!status) {
 		vma->vm_ops = &nfs_file_vm_ops;
-		vma->vm_flags |= VM_CAN_NONLINEAR;
-		file_accessed(file);
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
 	}
 	return status;
 }
@@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			file->f_path.dentry->d_name.name,
 			mapping->host->i_ino, len, (long long) pos);
 
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
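nfs_write_begin() now blocks behind the NFS_INO_FLUSHING bit using nfs_wait_bit_killable(), which the fs/nfs/inode.c hunk further down introduces; the wait aborts with -ERESTARTSYS on a fatal signal. The same killable bit-wait pattern in isolation (MY_BIT and some_flags are hypothetical, not from this patch):

	int ret;

	ret = wait_on_bit(&some_flags, MY_BIT,
			  nfs_wait_bit_killable, TASK_KILLABLE);
	if (ret)
		return ret;	/* -ERESTARTSYS: a fatal signal arrived */
	/* bit is clear (or was never set); safe to proceed */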
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	return copied;
 }
 
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ *   page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
 	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page->mapping->host, page);
+
+	nfs_fscache_invalidate_page(page, page->mapping->host);
 }
 
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
 	/* If PagePrivate() is set, then the page is not freeable */
-	return 0;
+	if (PagePrivate(page))
+		return 0;
+	return nfs_fscache_release_page(page, gfp);
 }
 
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
 static int nfs_launder_page(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
 
 	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
 		inode->i_ino, (long long)page_offset(page));
 
+	nfs_fscache_wait_on_page_write(nfsi, page);
 	return nfs_wb_page(inode, page);
 }
 
@@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 };
 
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
@@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
+	/* make sure the cache has finished storing the page */
+	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
 	lock_page(page);
 	mapping = page->mapping;
 	if (mapping != dentry->d_inode->i_mapping)
@@ -479,11 +516,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out_unlock;
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
-	if (ret == 0)
-		ret = pagelen;
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	return ret;
+	return VM_FAULT_SIGBUS;
 }
 
 static struct vm_operations_struct nfs_file_vm_ops = {
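The ->page_mkwrite() change above adopts the vm_fault-based calling convention: the page arrives in vmf->page, and success is reported as VM_FAULT_LOCKED with the page still locked, rather than as a byte count. A stripped-down sketch of that contract (prepare_page_for_write() is hypothetical; this is not the NFS implementation):

	static int demo_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping != vma->vm_file->f_mapping)
			goto out_unlock;
		if (prepare_page_for_write(page) == 0)
			return VM_FAULT_LOCKED;	/* success: page stays locked */
	out_unlock:
		unlock_page(page);
		return VM_FAULT_SIGBUS;
	}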
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
28 * index can than have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77	memset(key, 0, len);
 78
 79	key->nfsversion = clp->rpc_ops->version;
 80	key->family = clp->cl_addr.ss_family;
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
199 * - Should return the amount of amount stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
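Taken together, the cookie definitions above give NFS a three-level index: the netfs's primary index holds server objects, servers hold superblocks, and superblocks hold per-inode data objects. A compressed sketch of how they are chained via fscache_acquire_cookie(), error handling omitted (clp, nfss and nfsi stand for the usual nfs_client, nfs_server and nfs_inode pointers; the real call sites are in fs/nfs/fscache.c below):

	struct fscache_cookie *server, *super, *object;

	server = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
					&nfs_fscache_server_index_def, clp);
	super  = fscache_acquire_cookie(server,
					&nfs_fscache_super_index_def, nfss);
	object = fscache_acquire_cookie(super,
					&nfs_fscache_inode_object_def, nfsi);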
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try and get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off the cache with regard to a per-inode cookie if opened for writing,
233 * invalidating all the pages in the page cache relating to the associated
234 * inode to clear the per-page caching.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by parallel nfs_open() functions.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322 nfss->nfs_client->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397 /* if the read completes with an error, we just unlock the page and let
398 * the VM reissue the readpage */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
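nfs_fscache_get_super_cookie() above serialises superblock keys through an rb-tree so that two superblocks with identical parameters cannot both bind to the cache unless a uniquifier tells them apart. The ordering it walks the tree with, condensed into a single hedged comparator (nfs_fscache_key_cmp is not a function in this patch):

	static int nfs_fscache_key_cmp(const struct nfs_fscache_key *a,
				       const struct nfs_fscache_key *b)
	{
		int diff;

		if (a->nfs_client != b->nfs_client)
			return a->nfs_client < b->nfs_client ? -1 : 1;
		/* the fixed block includes uniq_len, so equal blocks imply
		 * equal-length uniquifiers */
		diff = memcmp(&a->key, &b->key, sizeof(a->key));
		if (diff)
			return diff;
		return memcmp(a->key.uniquifier, b->key.uniquifier,
			      a->key.uniq_len);
	}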
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
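The #else half of this header keeps every call site building when CONFIG_NFS_FSCACHE is off: hooks collapse to no-ops, the read paths report -ENOBUFS so callers fall back to the server, and release defaults to "may free". The same stub pattern in miniature (names hypothetical):

	#ifdef CONFIG_FOO_CACHE
	extern int foo_cache_read(struct inode *inode, struct page *page);
	#else
	static inline int foo_cache_read(struct inode *inode, struct page *page)
	{
		return -ENOBUFS;	/* caller falls back to the server path */
	}
	#endif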
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " getroot encountered non-directory\n");
 		return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " lookupfh encountered non-directory\n");
 		return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 }
 
 /**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+/**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
  *
@@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
+	nfs_fscache_release_inode_cookie(inode);
 }
 
 /**
@@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 	struct inode *inode = ERR_PTR(-ENOENT);
 	unsigned long hash;
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
-
-	if (!fattr->nlink) {
-		printk("NFS: Buggy server - nlink == 0!\n");
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
-	}
 
 	hash = nfs_fattr_to_ino_t(fattr);
 
@@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				&& fattr->size <= NFS_LIMIT_READDIRPLUS)
 			set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 		/* Deal with crossing mountpoints */
-		if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+		if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+				&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 			if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 				inode->i_op = &nfs_referral_inode_operations;
 			else
@@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
304 else 316 else
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 317 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 318
319 memset(&inode->i_atime, 0, sizeof(inode->i_atime));
320 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
321 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
322 nfsi->change_attr = 0;
323 inode->i_size = 0;
324 inode->i_nlink = 0;
325 inode->i_uid = -2;
326 inode->i_gid = -2;
327 inode->i_blocks = 0;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329
307 nfsi->read_cache_jiffies = fattr->time_start; 330 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->attr_gencount = fattr->gencount; 331 nfsi->attr_gencount = fattr->gencount;
309 inode->i_atime = fattr->atime; 332 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
310 inode->i_mtime = fattr->mtime; 333 inode->i_atime = fattr->atime;
311 inode->i_ctime = fattr->ctime; 334 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
312 if (fattr->valid & NFS_ATTR_FATTR_V4) 335 inode->i_mtime = fattr->mtime;
336 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
337 inode->i_ctime = fattr->ctime;
338 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
313 nfsi->change_attr = fattr->change_attr; 339 nfsi->change_attr = fattr->change_attr;
314 inode->i_size = nfs_size_to_loff_t(fattr->size); 340 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
315 inode->i_nlink = fattr->nlink; 341 inode->i_size = nfs_size_to_loff_t(fattr->size);
316 inode->i_uid = fattr->uid; 342 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
317 inode->i_gid = fattr->gid; 343 inode->i_nlink = fattr->nlink;
318 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 344 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
345 inode->i_uid = fattr->uid;
346 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
347 inode->i_gid = fattr->gid;
348 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
349 inode->i_blocks = fattr->du.nfs2.blocks;
350 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
319 /* 351 /*
320 * report the blocks in 512byte units 352 * report the blocks in 512byte units
321 */ 353 */
322 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 354 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
323 } else {
324 inode->i_blocks = fattr->du.nfs2.blocks;
325 } 355 }
326 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 356 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
327 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
330 359
360 nfs_fscache_init_inode_cookie(inode);
361
331 unlock_new_inode(inode); 362 unlock_new_inode(inode);
332 } else 363 } else
333 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
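The nfs_fhget() rework above replaces the all-or-nothing NFS_ATTR_FATTR test with per-field NFS_ATTR_FATTR_* validity bits: the inode is first reset to conservative defaults, then each field is copied only when the server actually supplied it. A minimal user-space sketch of that pattern; the flag values and struct layouts are illustrative, not the kernel's:

#include <stdint.h>
#include <string.h>

#define ATTR_SIZE  (1u << 0)
#define ATTR_NLINK (1u << 1)
#define ATTR_OWNER (1u << 2)

struct fake_fattr { uint32_t valid; uint64_t size; uint32_t nlink, uid; };
struct fake_inode { uint64_t size; uint32_t nlink, uid; };

static void apply_fattr(struct fake_inode *ino, const struct fake_fattr *f)
{
	/* Reset to conservative defaults first, as the patched nfs_fhget() does... */
	memset(ino, 0, sizeof(*ino));
	ino->uid = (uint32_t)-2;	/* "nobody", matching the -2 in the hunk */

	/* ...then apply only the fields whose validity bit the server set. */
	if (f->valid & ATTR_SIZE)
		ino->size = f->size;
	if (f->valid & ATTR_NLINK)
		ino->nlink = f->nlink;
	if (f->valid & ATTR_OWNER)
		ino->uid = f->uid;
}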
@@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
514 return err; 545 return err;
515} 546}
516 547
548/**
549 * nfs_close_context - Common close_context() routine NFSv2/v3
550 * @ctx: pointer to context
551 * @is_sync: is this a synchronous close
552 *
553 * always ensure that the attributes are up to date if we're mounted
554 * with close-to-open semantics
555 */
556void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
557{
558 struct inode *inode;
559 struct nfs_server *server;
560
561 if (!(ctx->mode & FMODE_WRITE))
562 return;
563 if (!is_sync)
564 return;
565 inode = ctx->path.dentry->d_inode;
566 if (!list_empty(&NFS_I(inode)->open_files))
567 return;
568 server = NFS_SERVER(inode);
569 if (server->flags & NFS_MOUNT_NOCTO)
570 return;
571 nfs_revalidate_inode(server, inode);
572}
573
517static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 574static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
518{ 575{
519 struct nfs_open_context *ctx; 576 struct nfs_open_context *ctx;
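nfs_close_context(), added above, implements close-to-open cache consistency for v2/v3: the last synchronous close of a file that was open for write triggers a revalidation (a GETATTR) so the next opener sees fresh attributes. A self-contained sketch of the same early-return chain; the types and flag values are invented for illustration:

#include <stdbool.h>

#define FMODE_WRITE_BIT	0x2
#define MOUNT_NOCTO_BIT	0x1

struct open_ctx { unsigned mode; int open_files; unsigned mount_flags; };

static bool needs_cto_revalidate(const struct open_ctx *c, int is_sync)
{
	if (!(c->mode & FMODE_WRITE_BIT))	/* readers cannot have dirtied it */
		return false;
	if (!is_sync)				/* async close: skip the round trip */
		return false;
	if (c->open_files != 0)			/* not the last close of the inode */
		return false;
	if (c->mount_flags & MOUNT_NOCTO_BIT)	/* "nocto" mount opts out */
		return false;
	return true;				/* revalidate attributes now */
}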
@@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
540 return ctx; 597 return ctx;
541} 598}
542 599
543static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) 600static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
544{ 601{
545 struct inode *inode; 602 struct inode *inode = ctx->path.dentry->d_inode;
546
547 if (ctx == NULL)
548 return;
549 603
550 inode = ctx->path.dentry->d_inode;
551 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 604 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
552 return; 605 return;
553 list_del(&ctx->list); 606 list_del(&ctx->list);
554 spin_unlock(&inode->i_lock); 607 spin_unlock(&inode->i_lock);
555 if (ctx->state != NULL) { 608 NFS_PROTO(inode)->close_context(ctx, is_sync);
556 if (wait)
557 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
558 else
559 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
560 }
561 if (ctx->cred != NULL) 609 if (ctx->cred != NULL)
562 put_rpccred(ctx->cred); 610 put_rpccred(ctx->cred);
563 path_put(&ctx->path); 611 path_put(&ctx->path);
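The __put_nfs_open_context() hunk swaps an inline NFSv4-only branch for a close_context hook on the per-version rpc_ops table, so the generic code can call one hook without version checks. A trimmed illustration of that function-pointer dispatch; the real nfs_rpc_ops carries many more hooks than shown here:

struct open_context;	/* opaque to the generic layer */

struct rpc_ops_sketch {
	void (*close_context)(struct open_context *ctx, int is_sync);
};

static void v3_close(struct open_context *ctx, int is_sync)
{
	(void)ctx; (void)is_sync;	/* would do the CTO revalidation */
}

static void v4_close(struct open_context *ctx, int is_sync)
{
	(void)ctx; (void)is_sync;	/* would close the NFSv4 state */
}

static const struct rpc_ops_sketch v3_ops = { .close_context = v3_close };
static const struct rpc_ops_sketch v4_ops = { .close_context = v4_close };
/* generic code: ops->close_context(ctx, is_sync), no version test needed */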
@@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
642 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
643 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
644 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
645 return 0; 694 return 0;
646} 695}
647 696
@@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
670 if (NFS_STALE(inode)) 719 if (NFS_STALE(inode))
671 goto out; 720 goto out;
672 721
673 if (NFS_STALE(inode))
674 goto out;
675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 722 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 723 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
678 if (status != 0) { 724 if (status != 0) {
@@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
745 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
746 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
747 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
748 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
749 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
750 return 0; 797 return 0;
@@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
815{ 862{
816 struct nfs_inode *nfsi = NFS_I(inode); 863 struct nfs_inode *nfsi = NFS_I(inode);
817 864
818 if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && 865 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
819 nfsi->change_attr == fattr->pre_change_attr) { 866 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
867 && nfsi->change_attr == fattr->pre_change_attr) {
820 nfsi->change_attr = fattr->change_attr; 868 nfsi->change_attr = fattr->change_attr;
821 if (S_ISDIR(inode->i_mode)) 869 if (S_ISDIR(inode->i_mode))
822 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 870 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
823 } 871 }
824 /* If we have atomic WCC data, we may update some attributes */ 872 /* If we have atomic WCC data, we may update some attributes */
825 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 873 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
826 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 874 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
875 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
827 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 876 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
828 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 877
878 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
879 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
880 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
829 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 881 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
830 if (S_ISDIR(inode->i_mode)) 882 if (S_ISDIR(inode->i_mode))
831 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 883 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
832 }
833 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
834 nfsi->npages == 0)
835 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
836 } 884 }
885 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
886 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
887 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
888 && nfsi->npages == 0)
889 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837} 890}
838 891
839/** 892/**
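The strengthened nfs_wcc_update_inode() above only trusts a post-op attribute when its pre-op twin is both present (its validity bit is set) and equal to the cached value, i.e. no third party raced the operation. A toy model of that check, with invented flag values:

#include <stdint.h>
#include <stdbool.h>

#define F_PRESIZE (1u << 0)
#define F_SIZE    (1u << 1)

struct wcc { uint32_t valid; uint64_t pre_size, size; };

static bool wcc_size_update(uint64_t *cached, const struct wcc *w)
{
	if ((w->valid & F_PRESIZE) && (w->valid & F_SIZE)
	    && *cached == w->pre_size) {
		*cached = w->size;	/* no writer raced us: safe to apply */
		return true;
	}
	return false;			/* partial or stale data: leave the cache */
}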
@@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
853 906
854 907
855 /* Has the inode gone and changed behind our back? */ 908 /* Has the inode gone and changed behind our back? */
856 if (nfsi->fileid != fattr->fileid 909 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
857 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 910 return -EIO;
911 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
858 return -EIO; 912 return -EIO;
859 }
860 913
861 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 914 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
862 nfsi->change_attr != fattr->change_attr) 915 nfsi->change_attr != fattr->change_attr)
863 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 916 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
864 917
865 /* Verify a few of the more important attributes */ 918 /* Verify a few of the more important attributes */
866 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 919 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
867 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 920 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
868 921
869 cur_size = i_size_read(inode); 922 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
870 new_isize = nfs_size_to_loff_t(fattr->size); 923 cur_size = i_size_read(inode);
871 if (cur_size != new_isize && nfsi->npages == 0) 924 new_isize = nfs_size_to_loff_t(fattr->size);
872 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 925 if (cur_size != new_isize && nfsi->npages == 0)
926 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
927 }
873 928
874 /* Have any file permissions changed? */ 929 /* Have any file permissions changed? */
875 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 930 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
876 || inode->i_uid != fattr->uid 931 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
877 || inode->i_gid != fattr->gid) 932 if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
933 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
934 if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
878 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 935 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
879 936
880 /* Has the link count changed? */ 937 /* Has the link count changed? */
881 if (inode->i_nlink != fattr->nlink) 938 if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
882 invalid |= NFS_INO_INVALID_ATTR; 939 invalid |= NFS_INO_INVALID_ATTR;
883 940
884 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 941 if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
885 invalid |= NFS_INO_INVALID_ATIME; 942 invalid |= NFS_INO_INVALID_ATIME;
886 943
887 if (invalid != 0) 944 if (invalid != 0)
@@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
893 950
894static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 951static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
895{ 952{
953 if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
954 return 0;
896 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; 955 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
897} 956}
898 957
899static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 958static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
900{ 959{
960 if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
961 return 0;
901 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); 962 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
902} 963}
903 964
@@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
975 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
976 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
977 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
978 return status; 1040 return status;
979} 1041}
980 1042
@@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1033 /* Don't do a WCC update if these attributes are already stale */ 1095 /* Don't do a WCC update if these attributes are already stale */
1034 if ((fattr->valid & NFS_ATTR_FATTR) == 0 || 1096 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1035 !nfs_inode_attrs_need_update(inode, fattr)) { 1097 !nfs_inode_attrs_need_update(inode, fattr)) {
1036 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); 1098 fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
1099 | NFS_ATTR_FATTR_PRESIZE
1100 | NFS_ATTR_FATTR_PREMTIME
1101 | NFS_ATTR_FATTR_PRECTIME);
1037 goto out_noforce; 1102 goto out_noforce;
1038 } 1103 }
1039 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1104 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
1040 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1105 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
1041 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1106 fattr->pre_change_attr = NFS_I(inode)->change_attr;
1042 fattr->valid |= NFS_ATTR_WCC_V4; 1107 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
1043 } 1108 }
1044 if ((fattr->valid & NFS_ATTR_FATTR) != 0 && 1109 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
1045 (fattr->valid & NFS_ATTR_WCC) == 0) { 1110 (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
1046 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1111 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
1112 fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
1113 }
1114 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
1115 (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
1047 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1116 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
1117 fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
1118 }
1119 if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
1120 (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
1048 fattr->pre_size = i_size_read(inode); 1121 fattr->pre_size = i_size_read(inode);
1049 fattr->valid |= NFS_ATTR_WCC; 1122 fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
1050 } 1123 }
1051out_noforce: 1124out_noforce:
1052 status = nfs_post_op_update_inode_locked(inode, fattr); 1125 status = nfs_post_op_update_inode_locked(inode, fattr);
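nfs_post_op_update_inode_force_wcc() now synthesizes each missing pre-op attribute individually from the cached inode, rather than keying off the coarse NFS_ATTR_WCC/NFS_ATTR_WCC_V4 bits. A companion sketch for the size case, with illustrative flags:

#include <stdint.h>

#define F_SIZE    (1u << 0)
#define F_PRESIZE (1u << 1)

struct wcc_fattr { uint32_t valid; uint64_t size, pre_size; };

static void force_pre_size(struct wcc_fattr *f, uint64_t cached_size)
{
	if ((f->valid & F_SIZE) && !(f->valid & F_PRESIZE)) {
		f->pre_size = cached_size;	/* trust our cache as the "before" */
		f->valid |= F_PRESIZE;		/* so the WCC comparison can run */
	}
}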
@@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1078 __func__, inode->i_sb->s_id, inode->i_ino, 1151 __func__, inode->i_sb->s_id, inode->i_ino,
1079 atomic_read(&inode->i_count), fattr->valid); 1152 atomic_read(&inode->i_count), fattr->valid);
1080 1153
1081 if (nfsi->fileid != fattr->fileid) 1154 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
1082 goto out_fileid; 1155 goto out_fileid;
1083 1156
1084 /* 1157 /*
1085 * Make sure the inode's type hasn't changed. 1158 * Make sure the inode's type hasn't changed.
1086 */ 1159 */
1087 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1160 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1088 goto out_changed; 1161 goto out_changed;
1089 1162
1090 server = NFS_SERVER(inode); 1163 server = NFS_SERVER(inode);
1091 /* Update the fsid? */ 1164 /* Update the fsid? */
1092 if (S_ISDIR(inode->i_mode) && 1165 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1093 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1166 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1094 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1167 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
1095 server->fsid = fattr->fsid; 1168 server->fsid = fattr->fsid;
@@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1099 */ 1172 */
1100 nfsi->read_cache_jiffies = fattr->time_start; 1173 nfsi->read_cache_jiffies = fattr->time_start;
1101 1174
1102 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME 1175 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
1103 | NFS_INO_REVAL_PAGECACHE); 1176 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1177 | NFS_INO_INVALID_ATIME
1178 | NFS_INO_REVAL_PAGECACHE);
1104 1179
1105 /* Do atomic weak cache consistency updates */ 1180 /* Do atomic weak cache consistency updates */
1106 nfs_wcc_update_inode(inode, fattr); 1181 nfs_wcc_update_inode(inode, fattr);
1107 1182
1108 /* More cache consistency checks */ 1183 /* More cache consistency checks */
1109 if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { 1184 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1185 if (nfsi->change_attr != fattr->change_attr) {
1186 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1187 inode->i_sb->s_id, inode->i_ino);
1188 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1189 if (S_ISDIR(inode->i_mode))
1190 nfs_force_lookup_revalidate(inode);
1191 nfsi->change_attr = fattr->change_attr;
1192 }
1193 }
1194
1195 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1110 /* NFSv2/v3: Check if the mtime agrees */ 1196 /* NFSv2/v3: Check if the mtime agrees */
1111 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 1197 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1112 dprintk("NFS: mtime change on server for file %s/%ld\n", 1198 dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1114 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1200 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1115 if (S_ISDIR(inode->i_mode)) 1201 if (S_ISDIR(inode->i_mode))
1116 nfs_force_lookup_revalidate(inode); 1202 nfs_force_lookup_revalidate(inode);
1203 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1117 } 1204 }
1205 }
1206 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1118 /* If ctime has changed we should definitely clear access+acl caches */ 1207 /* If ctime has changed we should definitely clear access+acl caches */
1119 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1208 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
1120 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1209 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1121 } else if (nfsi->change_attr != fattr->change_attr) { 1210 /* and probably clear data for a directory too as utimes can cause
1122 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1211 * havoc with our cache.
1123 inode->i_sb->s_id, inode->i_ino); 1212 */
1124 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1213 if (S_ISDIR(inode->i_mode)) {
1125 if (S_ISDIR(inode->i_mode)) 1214 invalid |= NFS_INO_INVALID_DATA;
1126 nfs_force_lookup_revalidate(inode); 1215 nfs_force_lookup_revalidate(inode);
1216 }
1217 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1218 }
1127 } 1219 }
1128 1220
1129 /* Check if our cached file size is stale */ 1221 /* Check if our cached file size is stale */
1130 new_isize = nfs_size_to_loff_t(fattr->size); 1222 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
1131 cur_isize = i_size_read(inode); 1223 new_isize = nfs_size_to_loff_t(fattr->size);
1132 if (new_isize != cur_isize) { 1224 cur_isize = i_size_read(inode);
1133 /* Do we perhaps have any outstanding writes, or has 1225 if (new_isize != cur_isize) {
1134 * the file grown beyond our last write? */ 1226 /* Do we perhaps have any outstanding writes, or has
1135 if (nfsi->npages == 0 || new_isize > cur_isize) { 1227 * the file grown beyond our last write? */
1136 i_size_write(inode, new_isize); 1228 if (nfsi->npages == 0 || new_isize > cur_isize) {
1137 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1229 i_size_write(inode, new_isize);
1230 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1231 }
1232 dprintk("NFS: isize change on server for file %s/%ld\n",
1233 inode->i_sb->s_id, inode->i_ino);
1138 } 1234 }
1139 dprintk("NFS: isize change on server for file %s/%ld\n",
1140 inode->i_sb->s_id, inode->i_ino);
1141 } 1235 }
1142 1236
1143 1237
1144 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1238 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1145 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1239 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1146 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1147 nfsi->change_attr = fattr->change_attr;
1148
1149 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
1150 inode->i_uid != fattr->uid ||
1151 inode->i_gid != fattr->gid)
1152 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1153 1240
1154 if (inode->i_nlink != fattr->nlink) 1241 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1155 invalid |= NFS_INO_INVALID_ATTR; 1242 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1243 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1244 inode->i_mode = fattr->mode;
1245 }
1246 }
1247 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1248 if (inode->i_uid != fattr->uid) {
1249 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1250 inode->i_uid = fattr->uid;
1251 }
1252 }
1253 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1254 if (inode->i_gid != fattr->gid) {
1255 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1256 inode->i_gid = fattr->gid;
1257 }
1258 }
1156 1259
1157 inode->i_mode = fattr->mode; 1260 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1158 inode->i_nlink = fattr->nlink; 1261 if (inode->i_nlink != fattr->nlink) {
1159 inode->i_uid = fattr->uid; 1262 invalid |= NFS_INO_INVALID_ATTR;
1160 inode->i_gid = fattr->gid; 1263 if (S_ISDIR(inode->i_mode))
1264 invalid |= NFS_INO_INVALID_DATA;
1265 inode->i_nlink = fattr->nlink;
1266 }
1267 }
1161 1268
1162 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 1269 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1163 /* 1270 /*
1164 * report the blocks in 512byte units 1271 * report the blocks in 512byte units
1165 */ 1272 */
1166 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 1273 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
1167 } else {
1168 inode->i_blocks = fattr->du.nfs2.blocks;
1169 } 1274 }
1275 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
1276 inode->i_blocks = fattr->du.nfs2.blocks;
1170 1277
1171 /* Update attrtimeo value if we're out of the unstable period */ 1278 /* Update attrtimeo value if we're out of the unstable period */
1172 if (invalid & NFS_INO_INVALID_ATTR) { 1279 if (invalid & NFS_INO_INVALID_ATTR) {
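Throughout the rewritten nfs_update_inode() above, each per-attribute comparison merely ORs flags into a single 'invalid' mask, which is acted on once after all the checks. A tiny illustration of that accumulate-then-apply shape, with invented flag names:

#include <stdint.h>

#define INV_ATTR (1u << 0)
#define INV_DATA (1u << 1)

static uint32_t check_size(uint64_t cached, uint64_t server, int dirty_pages)
{
	uint32_t invalid = 0;

	if (cached != server && !dirty_pages)
		invalid |= INV_ATTR | INV_DATA;	/* record it, act later */
	return invalid;				/* caller ORs all such results */
}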
@@ -1274,7 +1381,6 @@ static void init_once(void *foo)
1274 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1381 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1275 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1382 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1276 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1383 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1277 nfsi->ncommit = 0;
1278 nfsi->npages = 0; 1384 nfsi->npages = 0;
1279 atomic_set(&nfsi->silly_count, 1); 1385 atomic_set(&nfsi->silly_count, 1);
1280 INIT_HLIST_HEAD(&nfsi->silly_list); 1386 INIT_HLIST_HEAD(&nfsi->silly_list);
@@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void)
1337{ 1443{
1338 int err; 1444 int err;
1339 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1340 err = nfsiod_start(); 1450 err = nfsiod_start();
1341 if (err) 1451 if (err)
1342 goto out6; 1452 goto out6;
@@ -1389,6 +1499,8 @@ out4:
1389out5: 1499out5:
1390 nfsiod_stop(); 1500 nfsiod_stop();
1391out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1392 return err; 1504 return err;
1393} 1505}
1394 1506
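The init_nfs_fs()/exit_nfs_fs() hunks register fscache first and unregister it last, extending the existing goto-unwind ladder: each failure label undoes exactly the stages that succeeded, in reverse order. A minimal illustration of the idiom, with made-up stage names:

#include <stdio.h>

static int stage_a(void) { return 0; }
static int stage_b(void) { return -1; }		/* pretend this one fails */
static void undo_a(void) { puts("undo a"); }

static int init_sketch(void)
{
	int err;

	err = stage_a();
	if (err)
		goto out_none;
	err = stage_b();
	if (err)
		goto out_a;	/* unwind strictly in reverse order */
	return 0;
out_a:
	undo_a();
out_none:
	return err;
}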
@@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1399 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1400 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1401 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1402#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1403 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1404#endif 1517#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
@@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
152extern struct rpc_procinfo nfs4_procedures[]; 156extern struct rpc_procinfo nfs4_procedures[];
153#endif 157#endif
154 158
159/* proc.c */
160void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
161
155/* dir.c */ 162/* dir.c */
156extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 163extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
157 164
@@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *);
165extern void nfs4_clear_inode(struct inode *); 172extern void nfs4_clear_inode(struct inode *);
166#endif 173#endif
167void nfs_zap_acl_cache(struct inode *inode); 174void nfs_zap_acl_cache(struct inode *inode);
175extern int nfs_wait_bit_killable(void *word);
168 176
169/* super.c */ 177/* super.c */
170void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); 178void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
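nfs_add_fscache_stats() above follows the existing iostat pattern: pin the task to a CPU with get_cpu(), bump that CPU's private counter, then release with put_cpu_no_resched(), so hot counters never bounce a shared cache line between CPUs. A user-space analogue using C11 thread-local storage; illustrative only, not the kernel's per-CPU API:

static _Thread_local unsigned long fscache_hits;	/* one slot per thread */

static inline void add_fscache_stat(unsigned long addend)
{
	fscache_hits += addend;	/* private storage: no atomics, no contention */
}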
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
120static __be32 * 120static __be32 *
121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev, type;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 type = ntohl(*p++);
125 fattr->mode = ntohl(*p++); 125 fattr->mode = ntohl(*p++);
126 fattr->nlink = ntohl(*p++); 126 fattr->nlink = ntohl(*p++);
127 fattr->uid = ntohl(*p++); 127 fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
138 p = xdr_decode_time(p, &fattr->ctime); 138 p = xdr_decode_time(p, &fattr->ctime);
139 fattr->valid |= NFS_ATTR_FATTR; 139 fattr->valid |= NFS_ATTR_FATTR_V2;
140 fattr->rdev = new_decode_dev(rdev); 140 fattr->rdev = new_decode_dev(rdev);
141 if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { 141 if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
142 fattr->type = NFFIFO;
143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 142 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
144 fattr->rdev = 0; 143 fattr->rdev = 0;
145 } 144 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
328 data->arg.create.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
329 } 329 }
330 330
331 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current_umask();
332 332
333 for (;;) { 333 for (;;) {
334 status = nfs3_do_create(dir, dentry, data); 334 status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
528 528
529 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
530 530
531 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current_umask();
532 532
533 data = nfs3_alloc_createdata(); 533 data = nfs3_alloc_createdata();
534 if (data == NULL) 534 if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
640 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
641 641
642 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current_umask();
643 643
644 data = nfs3_alloc_createdata(); 644 data = nfs3_alloc_createdata();
645 if (data == NULL) 645 if (data == NULL)
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
834 .commit_done = nfs3_commit_done, 834 .commit_done = nfs3_commit_done,
835 .lock = nfs3_proc_lock, 835 .lock = nfs3_proc_lock,
836 .clear_acl_cache = nfs3_forget_cached_acls, 836 .clear_acl_cache = nfs3_forget_cached_acls,
837 .close_context = nfs_close_context,
837}; 838};
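The three nfs3proc.c hunks above replace direct current->fs->umask dereferences with the current_umask() helper; hiding the field behind an accessor keeps every caller stable if the backing structure later changes. Rough shape of such a wrapper, where the stand-in global replaces the kernel's 'current' and is assumed to be set up elsewhere:

struct fs_struct_sketch { int umask; };
static struct fs_struct_sketch *current_fs;	/* assumed initialized at setup */

static inline int current_umask_sketch(void)
{
	return current_fs->umask;	/* one place to change if fs moves */
}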
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde46..35869a4921f1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
91/* 91/*
92 * Map file type to S_IFMT bits 92 * Map file type to S_IFMT bits
93 */ 93 */
94static struct { 94static const umode_t nfs_type2fmt[] = {
95 unsigned int mode; 95 [NF3BAD] = 0,
96 unsigned int nfs2type; 96 [NF3REG] = S_IFREG,
97} nfs_type2fmt[] = { 97 [NF3DIR] = S_IFDIR,
98 { 0, NFNON }, 98 [NF3BLK] = S_IFBLK,
99 { S_IFREG, NFREG }, 99 [NF3CHR] = S_IFCHR,
100 { S_IFDIR, NFDIR }, 100 [NF3LNK] = S_IFLNK,
101 { S_IFBLK, NFBLK }, 101 [NF3SOCK] = S_IFSOCK,
102 { S_IFCHR, NFCHR }, 102 [NF3FIFO] = S_IFIFO,
103 { S_IFLNK, NFLNK },
104 { S_IFSOCK, NFSOCK },
105 { S_IFIFO, NFFIFO },
106 { 0, NFBAD }
107}; 103};
108 104
109/* 105/*
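The nfs_type2fmt rewrite above drops the parallel {mode, nfs2type} struct table for a plain lookup array built with C99 designated initializers, keeping each slot visibly paired with its value and letting the wire type index the array directly. An illustrative re-creation; the enum ordering mirrors the NFSv3 wire values but should be treated as a sketch:

#include <sys/stat.h>

enum ftype3 { NF3NON, NF3REG, NF3DIR, NF3BLK, NF3CHR, NF3LNK,
	      NF3SOCK, NF3FIFO, NF3BAD };

static const unsigned int type2fmt[] = {
	[NF3BAD]  = 0,			/* unknown types get no S_IFMT bits */
	[NF3REG]  = S_IFREG,
	[NF3DIR]  = S_IFDIR,
	[NF3BLK]  = S_IFBLK,
	[NF3CHR]  = S_IFCHR,
	[NF3LNK]  = S_IFLNK,
	[NF3SOCK] = S_IFSOCK,
	[NF3FIFO] = S_IFIFO,
};
/* usage: mode = (wire_mode & ~S_IFMT) | type2fmt[type]; */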
@@ -148,13 +144,12 @@ static __be32 *
148xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
149{ 145{
150 unsigned int type, major, minor; 146 unsigned int type, major, minor;
151 int fmode; 147 umode_t fmode;
152 148
153 type = ntohl(*p++); 149 type = ntohl(*p++);
154 if (type >= NF3BAD) 150 if (type > NF3FIFO)
155 type = NF3BAD; 151 type = NF3NON;
156 fmode = nfs_type2fmt[type].mode; 152 fmode = nfs_type2fmt[type];
157 fattr->type = nfs_type2fmt[type].nfs2type;
158 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 153 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
159 fattr->nlink = ntohl(*p++); 154 fattr->nlink = ntohl(*p++);
160 fattr->uid = ntohl(*p++); 155 fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
177 p = xdr_decode_time3(p, &fattr->ctime); 172 p = xdr_decode_time3(p, &fattr->ctime);
178 173
179 /* Update the mode bits */ 174 /* Update the mode bits */
180 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); 175 fattr->valid |= NFS_ATTR_FATTR_V3;
181 return p; 176 return p;
182} 177}
183 178
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
233 p = xdr_decode_hyper(p, &fattr->pre_size); 228 p = xdr_decode_hyper(p, &fattr->pre_size);
234 p = xdr_decode_time3(p, &fattr->pre_mtime); 229 p = xdr_decode_time3(p, &fattr->pre_mtime);
235 p = xdr_decode_time3(p, &fattr->pre_ctime); 230 p = xdr_decode_time3(p, &fattr->pre_ctime);
236 fattr->valid |= NFS_ATTR_WCC; 231 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
232 | NFS_ATTR_FATTR_PREMTIME
233 | NFS_ATTR_FATTR_PRECTIME;
237 return p; 234 return p;
238} 235}
239 236
@@ -716,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
716 if (args->npages != 0) 713 if (args->npages != 0)
717 xdr_encode_pages(buf, args->pages, 0, args->len); 714 xdr_encode_pages(buf, args->pages, 0, args->len);
718 else 715 else
719 req->rq_slen += args->len; 716 req->rq_slen = xdr_adjust_iovec(req->rq_svec,
717 p + XDR_QUADLEN(args->len));
720 718
721 err = nfsacl_encode(buf, base, args->inode, 719 err = nfsacl_encode(buf, base, args->inode,
722 (args->mask & NFS_ACL) ? 720 (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..4674f8092da8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
193 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
194} 194}
195 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp) 196static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{ 197{
206 int res; 198 int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
208 might_sleep(); 200 might_sleep();
209 201
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 202 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE); 203 nfs_wait_bit_killable, TASK_KILLABLE);
212 return res; 204 return res;
213} 205}
214 206
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1439 if (calldata->arg.seqid == NULL) 1431 if (calldata->arg.seqid == NULL)
1440 goto out_free_calldata; 1432 goto out_free_calldata;
1441 calldata->arg.fmode = 0; 1433 calldata->arg.fmode = 0;
1442 calldata->arg.bitmask = server->attr_bitmask; 1434 calldata->arg.bitmask = server->cache_consistency_bitmask;
1443 calldata->res.fattr = &calldata->fattr; 1435 calldata->res.fattr = &calldata->fattr;
1444 calldata->res.seqid = calldata->arg.seqid; 1436 calldata->res.seqid = calldata->arg.seqid;
1445 calldata->res.server = server; 1437 calldata->res.server = server;
@@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1509 attr.ia_mode = nd->intent.open.create_mode; 1501 attr.ia_mode = nd->intent.open.create_mode;
1510 attr.ia_valid = ATTR_MODE; 1502 attr.ia_valid = ATTR_MODE;
1511 if (!IS_POSIXACL(dir)) 1503 if (!IS_POSIXACL(dir))
1512 attr.ia_mode &= ~current->fs->umask; 1504 attr.ia_mode &= ~current_umask();
1513 } else { 1505 } else {
1514 attr.ia_valid = 0; 1506 attr.ia_valid = 0;
1515 BUG_ON(nd->intent.open.flags & O_CREAT); 1507 BUG_ON(nd->intent.open.flags & O_CREAT);
@@ -1580,6 +1572,15 @@ out_drop:
1580 return 0; 1572 return 0;
1581} 1573}
1582 1574
1575void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1576{
1577 if (ctx->state == NULL)
1578 return;
1579 if (is_sync)
1580 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
1581 else
1582 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
1583}
1583 1584
1584static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1585{ 1586{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1600 server->caps |= NFS_CAP_HARDLINKS; 1601 server->caps |= NFS_CAP_HARDLINKS;
1601 if (res.has_symlinks != 0) 1602 if (res.has_symlinks != 0)
1602 server->caps |= NFS_CAP_SYMLINKS; 1603 server->caps |= NFS_CAP_SYMLINKS;
1604 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
1605 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1603 server->acl_bitmask = res.acl_bitmask; 1607 server->acl_bitmask = res.acl_bitmask;
1604 } 1608 }
1605 return status; 1609 return status;
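_nfs4_server_capabilities() above derives cache_consistency_bitmask by masking the server's advertised attribute set down to the four attributes needed to validate caches after CLOSE/WRITE/COMMIT/REMOVE (change, size, ctime, mtime), shrinking the post-op GETATTR payload. A sketch of that derivation; the bit values are placeholders, not the FATTR4_* wire constants:

#include <stdint.h>

#define W0_CHANGE (1u << 0)
#define W0_SIZE   (1u << 1)
#define W1_CTIME  (1u << 0)
#define W1_MTIME  (1u << 1)

static void build_cc_bitmask(uint32_t dst[2], const uint32_t supported[2])
{
	/* keep only what the server supports AND cache validation needs */
	dst[0] = supported[0] & (W0_CHANGE | W0_SIZE);
	dst[1] = supported[1] & (W1_CTIME | W1_MTIME);
}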
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2079 struct nfs_removeargs *args = msg->rpc_argp; 2083 struct nfs_removeargs *args = msg->rpc_argp;
2080 struct nfs_removeres *res = msg->rpc_resp; 2084 struct nfs_removeres *res = msg->rpc_resp;
2081 2085
2082 args->bitmask = server->attr_bitmask; 2086 args->bitmask = server->cache_consistency_bitmask;
2083 res->server = server; 2087 res->server = server;
2084 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2085} 2089}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2323 .pages = &page, 2327 .pages = &page,
2324 .pgbase = 0, 2328 .pgbase = 0,
2325 .count = count, 2329 .count = count,
2326 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2330 .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
2327 }; 2331 };
2328 struct nfs4_readdir_res res; 2332 struct nfs4_readdir_res res;
2329 struct rpc_message msg = { 2333 struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
2552{ 2556{
2553 struct nfs_server *server = NFS_SERVER(data->inode); 2557 struct nfs_server *server = NFS_SERVER(data->inode);
2554 2558
2555 data->args.bitmask = server->attr_bitmask; 2559 data->args.bitmask = server->cache_consistency_bitmask;
2556 data->res.server = server; 2560 data->res.server = server;
2557 data->timestamp = jiffies; 2561 data->timestamp = jiffies;
2558 2562
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
2575{ 2579{
2576 struct nfs_server *server = NFS_SERVER(data->inode); 2580 struct nfs_server *server = NFS_SERVER(data->inode);
2577 2581
2578 data->args.bitmask = server->attr_bitmask; 2582 data->args.bitmask = server->cache_consistency_bitmask;
2579 data->res.server = server; 2583 data->res.server = server;
2580 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 2584 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
2581} 2585}
@@ -2590,12 +2594,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
2590 unsigned long timestamp = (unsigned long)data; 2594 unsigned long timestamp = (unsigned long)data;
2591 2595
2592 if (task->tk_status < 0) { 2596 if (task->tk_status < 0) {
2593 switch (task->tk_status) { 2597 /* Unless we're shutting down, schedule state recovery! */
2594 case -NFS4ERR_STALE_CLIENTID: 2598 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
2595 case -NFS4ERR_EXPIRED: 2599 nfs4_schedule_state_recovery(clp);
2596 case -NFS4ERR_CB_PATH_DOWN:
2597 nfs4_schedule_state_recovery(clp);
2598 }
2599 return; 2600 return;
2600 } 2601 }
2601 spin_lock(&clp->cl_lock); 2602 spin_lock(&clp->cl_lock);
@@ -3678,6 +3679,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3678 return len; 3679 return len;
3679} 3680}
3680 3681
3682static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
3683{
3684 if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
3685 (fattr->valid & NFS_ATTR_FATTR_FSID) &&
3686 (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
3687 return;
3688
3689 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
3690 NFS_ATTR_FATTR_NLINK;
3691 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
3692 fattr->nlink = 2;
3693}
3694
3681int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 3695int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3682 struct nfs4_fs_locations *fs_locations, struct page *page) 3696 struct nfs4_fs_locations *fs_locations, struct page *page)
3683{ 3697{
@@ -3704,6 +3718,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3704 fs_locations->server = server; 3718 fs_locations->server = server;
3705 fs_locations->nlocations = 0; 3719 fs_locations->nlocations = 0;
3706 status = rpc_call_sync(server->client, &msg, 0); 3720 status = rpc_call_sync(server->client, &msg, 0);
3721 nfs_fixup_referral_attributes(&fs_locations->fattr);
3707 dprintk("%s: returned status = %d\n", __func__, status); 3722 dprintk("%s: returned status = %d\n", __func__, status);
3708 return status; 3723 return status;
3709} 3724}
@@ -3767,6 +3782,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3767 .commit_done = nfs4_commit_done, 3782 .commit_done = nfs4_commit_done,
3768 .lock = nfs4_proc_lock, 3783 .lock = nfs4_proc_lock,
3769 .clear_acl_cache = nfs4_zap_acl_attr, 3784 .clear_acl_cache = nfs4_zap_acl_attr,
3785 .close_context = nfs4_close_context,
3770}; 3786};
3771 3787
3772/* 3788/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, 65 unsigned short port;
66 nfs_callback_tcpport, cred); 66 int status;
67
68 port = nfs_callback_tcpport;
69 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6;
71
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
67 if (status == 0) 73 if (status == 0)
68 status = nfs4_proc_setclientid_confirm(clp, cred); 74 status = nfs4_proc_setclientid_confirm(clp, cred);
69 if (status == 0) 75 if (status == 0)
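The nfs4_init_client() change above advertises the IPv6 callback port in SETCLIENTID when the transport to the server is AF_INET6, since an IPv4 callback address would be unreachable there. A trivial selection sketch:

#include <sys/socket.h>

static unsigned short callback_port(sa_family_t family,
				    unsigned short v4_port,
				    unsigned short v6_port)
{
	return family == AF_INET6 ? v6_port : v4_port;
}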
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
522 decode_lookup_maxsz + \ 522 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 523 decode_fs_locations_maxsz)
524 524
525static struct { 525static const umode_t nfs_type2fmt[] = {
526 unsigned int mode; 526 [NF4BAD] = 0,
527 unsigned int nfs2type; 527 [NF4REG] = S_IFREG,
528} nfs_type2fmt[] = { 528 [NF4DIR] = S_IFDIR,
529 { 0, NFNON }, 529 [NF4BLK] = S_IFBLK,
530 { S_IFREG, NFREG }, 530 [NF4CHR] = S_IFCHR,
531 { S_IFDIR, NFDIR }, 531 [NF4LNK] = S_IFLNK,
532 { S_IFBLK, NFBLK }, 532 [NF4SOCK] = S_IFSOCK,
533 { S_IFCHR, NFCHR }, 533 [NF4FIFO] = S_IFIFO,
534 { S_IFLNK, NFLNK }, 534 [NF4ATTRDIR] = 0,
535 { S_IFSOCK, NFSOCK }, 535 [NF4NAMEDATTR] = 0,
536 { S_IFIFO, NFFIFO },
537 { 0, NFNON },
538 { 0, NFNON },
539}; 536};
540 537
541struct compound_hdr { 538struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2160static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2157static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2161{ 2158{
2162 __be32 *p; 2159 __be32 *p;
2160 int ret = 0;
2163 2161
2164 *type = 0; 2162 *type = 0;
2165 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2163 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2172 return -EIO; 2170 return -EIO;
2173 } 2171 }
2174 bitmap[0] &= ~FATTR4_WORD0_TYPE; 2172 bitmap[0] &= ~FATTR4_WORD0_TYPE;
2173 ret = NFS_ATTR_FATTR_TYPE;
2175 } 2174 }
2176 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); 2175 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2177 return 0; 2176 return ret;
2178} 2177}
2179 2178
2180static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2179static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2181{ 2180{
2182 __be32 *p; 2181 __be32 *p;
2182 int ret = 0;
2183 2183
2184 *change = 0; 2184 *change = 0;
2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2188 READ_BUF(8); 2188 READ_BUF(8);
2189 READ64(*change); 2189 READ64(*change);
2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2191 ret = NFS_ATTR_FATTR_CHANGE;
2191 } 2192 }
2192 dprintk("%s: change attribute=%Lu\n", __func__, 2193 dprintk("%s: change attribute=%Lu\n", __func__,
2193 (unsigned long long)*change); 2194 (unsigned long long)*change);
2194 return 0; 2195 return ret;
2195} 2196}
2196 2197
2197static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2198static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2198{ 2199{
2199 __be32 *p; 2200 __be32 *p;
2201 int ret = 0;
2200 2202
2201 *size = 0; 2203 *size = 0;
2202 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2204 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2205 READ_BUF(8); 2207 READ_BUF(8);
2206 READ64(*size); 2208 READ64(*size);
2207 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2209 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2210 ret = NFS_ATTR_FATTR_SIZE;
2208 } 2211 }
2209 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2212 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2210 return 0; 2213 return ret;
2211} 2214}
2212 2215
2213static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2216static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2245static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2248static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2246{ 2249{
2247 __be32 *p; 2250 __be32 *p;
2251 int ret = 0;
2248 2252
2249 fsid->major = 0; 2253 fsid->major = 0;
2250 fsid->minor = 0; 2254 fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2255 READ64(fsid->major); 2259 READ64(fsid->major);
2256 READ64(fsid->minor); 2260 READ64(fsid->minor);
2257 bitmap[0] &= ~FATTR4_WORD0_FSID; 2261 bitmap[0] &= ~FATTR4_WORD0_FSID;
2262 ret = NFS_ATTR_FATTR_FSID;
2258 } 2263 }
2259 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, 2264 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
2260 (unsigned long long)fsid->major, 2265 (unsigned long long)fsid->major,
2261 (unsigned long long)fsid->minor); 2266 (unsigned long long)fsid->minor);
2262 return 0; 2267 return ret;
2263} 2268}
2264 2269
2265static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2270static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2297static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2302static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2298{ 2303{
2299 __be32 *p; 2304 __be32 *p;
2305 int ret = 0;
2300 2306
2301 *fileid = 0; 2307 *fileid = 0;
2302 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2308 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2305 READ_BUF(8); 2311 READ_BUF(8);
2306 READ64(*fileid); 2312 READ64(*fileid);
2307 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2313 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2314 ret = NFS_ATTR_FATTR_FILEID;
2308 } 2315 }
2309 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2316 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2310 return 0; 2317 return ret;
2311} 2318}
2312 2319
2313static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2320static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2314{ 2321{
2315 __be32 *p; 2322 __be32 *p;
2323 int ret = 0;
2316 2324
2317 *fileid = 0; 2325 *fileid = 0;
2318 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2326 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2321 READ_BUF(8); 2329 READ_BUF(8);
2322 READ64(*fileid); 2330 READ64(*fileid);
2323 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2331 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2332 ret = NFS_ATTR_FATTR_FILEID;
2324 } 2333 }
2325 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2334 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2326 return 0; 2335 return ret;
2327} 2336}
2328 2337
2329static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2338static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2479 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) 2488 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2480 res->nlocations++; 2489 res->nlocations++;
2481 } 2490 }
2491 if (res->nlocations != 0)
2492 status = NFS_ATTR_FATTR_V4_REFERRAL;
2482out: 2493out:
2483 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2494 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2484 return status; 2495 return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
2580 return status; 2591 return status;
2581} 2592}
2582 2593
2583static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) 2594static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
2584{ 2595{
2596 uint32_t tmp;
2585 __be32 *p; 2597 __be32 *p;
2598 int ret = 0;
2586 2599
2587 *mode = 0; 2600 *mode = 0;
2588 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 2601 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
2589 return -EIO; 2602 return -EIO;
2590 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 2603 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
2591 READ_BUF(4); 2604 READ_BUF(4);
2592 READ32(*mode); 2605 READ32(tmp);
2593 *mode &= ~S_IFMT; 2606 *mode = tmp & ~S_IFMT;
2594 bitmap[1] &= ~FATTR4_WORD1_MODE; 2607 bitmap[1] &= ~FATTR4_WORD1_MODE;
2608 ret = NFS_ATTR_FATTR_MODE;
2595 } 2609 }
2596 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 2610 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
2597 return 0; 2611 return ret;
2598} 2612}
2599 2613
2600static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 2614static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
2601{ 2615{
2602 __be32 *p; 2616 __be32 *p;
2617 int ret = 0;
2603 2618
2604 *nlink = 1; 2619 *nlink = 1;
2605 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 2620 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2608 READ_BUF(4); 2623 READ_BUF(4);
2609 READ32(*nlink); 2624 READ32(*nlink);
2610 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 2625 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
2626 ret = NFS_ATTR_FATTR_NLINK;
2611 } 2627 }
2612 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 2628 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
2613 return 0; 2629 return ret;
2614} 2630}
2615 2631
2616static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 2632static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
2617{ 2633{
2618 uint32_t len; 2634 uint32_t len;
2619 __be32 *p; 2635 __be32 *p;
2636 int ret = 0;
2620 2637
2621 *uid = -2; 2638 *uid = -2;
2622 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 2639 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2626 READ32(len); 2643 READ32(len);
2627 READ_BUF(len); 2644 READ_BUF(len);
2628 if (len < XDR_MAX_NETOBJ) { 2645 if (len < XDR_MAX_NETOBJ) {
2629 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) 2646 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
2647 ret = NFS_ATTR_FATTR_OWNER;
2648 else
2630 dprintk("%s: nfs_map_name_to_uid failed!\n", 2649 dprintk("%s: nfs_map_name_to_uid failed!\n",
2631 __func__); 2650 __func__);
2632 } else 2651 } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2635 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2654 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2636 } 2655 }
2637 dprintk("%s: uid=%d\n", __func__, (int)*uid); 2656 dprintk("%s: uid=%d\n", __func__, (int)*uid);
2638 return 0; 2657 return ret;
2639} 2658}
2640 2659
2641static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 2660static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
2642{ 2661{
2643 uint32_t len; 2662 uint32_t len;
2644 __be32 *p; 2663 __be32 *p;
2664 int ret = 0;
2645 2665
2646 *gid = -2; 2666 *gid = -2;
2647 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 2667 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2651 READ32(len); 2671 READ32(len);
2652 READ_BUF(len); 2672 READ_BUF(len);
2653 if (len < XDR_MAX_NETOBJ) { 2673 if (len < XDR_MAX_NETOBJ) {
2654 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) 2674 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
2675 ret = NFS_ATTR_FATTR_GROUP;
2676 else
2655 dprintk("%s: nfs_map_group_to_gid failed!\n", 2677 dprintk("%s: nfs_map_group_to_gid failed!\n",
2656 __func__); 2678 __func__);
2657 } else 2679 } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2682 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2661 } 2683 }
2662 dprintk("%s: gid=%d\n", __func__, (int)*gid); 2684 dprintk("%s: gid=%d\n", __func__, (int)*gid);
2663 return 0; 2685 return ret;
2664} 2686}
2665 2687
2666static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 2688static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
2667{ 2689{
2668 uint32_t major = 0, minor = 0; 2690 uint32_t major = 0, minor = 0;
2669 __be32 *p; 2691 __be32 *p;
2692 int ret = 0;
2670 2693
2671 *rdev = MKDEV(0,0); 2694 *rdev = MKDEV(0,0);
2672 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) 2695 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
2681 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 2704 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
2682 *rdev = tmp; 2705 *rdev = tmp;
2683 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; 2706 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
2707 ret = NFS_ATTR_FATTR_RDEV;
2684 } 2708 }
2685 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 2709 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
2686 return 0; 2710 return ret;
2687} 2711}
2688 2712
2689static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2713static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2740static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 2764static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
2741{ 2765{
2742 __be32 *p; 2766 __be32 *p;
2767 int ret = 0;
2743 2768
2744 *used = 0; 2769 *used = 0;
2745 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 2770 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
2748 READ_BUF(8); 2773 READ_BUF(8);
2749 READ64(*used); 2774 READ64(*used);
2750 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 2775 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
2776 ret = NFS_ATTR_FATTR_SPACE_USED;
2751 } 2777 }
2752 dprintk("%s: space used=%Lu\n", __func__, 2778 dprintk("%s: space used=%Lu\n", __func__,
2753 (unsigned long long)*used); 2779 (unsigned long long)*used);
2754 return 0; 2780 return ret;
2755} 2781}
2756 2782
2757static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 2783static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
2778 return -EIO; 2804 return -EIO;
2779 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { 2805 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
2780 status = decode_attr_time(xdr, time); 2806 status = decode_attr_time(xdr, time);
2807 if (status == 0)
2808 status = NFS_ATTR_FATTR_ATIME;
2781 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; 2809 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
2782 } 2810 }
2783 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); 2811 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
2794 return -EIO; 2822 return -EIO;
2795 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { 2823 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
2796 status = decode_attr_time(xdr, time); 2824 status = decode_attr_time(xdr, time);
2825 if (status == 0)
2826 status = NFS_ATTR_FATTR_CTIME;
2797 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; 2827 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
2798 } 2828 }
2799 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); 2829 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
2810 return -EIO; 2840 return -EIO;
2811 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { 2841 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
2812 status = decode_attr_time(xdr, time); 2842 status = decode_attr_time(xdr, time);
2843 if (status == 0)
2844 status = NFS_ATTR_FATTR_MTIME;
2813 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; 2845 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
2814 } 2846 }
2815 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); 2847 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2994 uint32_t attrlen, 3026 uint32_t attrlen,
2995 bitmap[2] = {0}, 3027 bitmap[2] = {0},
2996 type; 3028 type;
2997 int status, fmode = 0; 3029 int status;
3030 umode_t fmode = 0;
2998 uint64_t fileid; 3031 uint64_t fileid;
2999 3032
3000 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3033 status = decode_op_hdr(xdr, OP_GETATTR);
3001 goto xdr_error; 3034 if (status < 0)
3002 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
3003 goto xdr_error; 3035 goto xdr_error;
3004 3036
3005 fattr->bitmap[0] = bitmap[0]; 3037 status = decode_attr_bitmap(xdr, bitmap);
3006 fattr->bitmap[1] = bitmap[1]; 3038 if (status < 0)
3039 goto xdr_error;
3007 3040
3008 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 3041 status = decode_attr_length(xdr, &attrlen, &savep);
3042 if (status < 0)
3009 goto xdr_error; 3043 goto xdr_error;
3010 3044
3011 3045
3012 if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) 3046 status = decode_attr_type(xdr, bitmap, &type);
3047 if (status < 0)
3013 goto xdr_error; 3048 goto xdr_error;
3014 fattr->type = nfs_type2fmt[type].nfs2type; 3049 fattr->mode = 0;
3015 fmode = nfs_type2fmt[type].mode; 3050 if (status != 0) {
3051 fattr->mode |= nfs_type2fmt[type];
3052 fattr->valid |= status;
3053 }
3016 3054
3017 if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) 3055 status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
3056 if (status < 0)
3018 goto xdr_error; 3057 goto xdr_error;
3019 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3058 fattr->valid |= status;
3059
3060 status = decode_attr_size(xdr, bitmap, &fattr->size);
3061 if (status < 0)
3020 goto xdr_error; 3062 goto xdr_error;
3021 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) 3063 fattr->valid |= status;
3064
3065 status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
3066 if (status < 0)
3022 goto xdr_error; 3067 goto xdr_error;
3023 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3068 fattr->valid |= status;
3069
3070 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3071 if (status < 0)
3024 goto xdr_error; 3072 goto xdr_error;
3025 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 3073 fattr->valid |= status;
3074
3075 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3026 struct nfs4_fs_locations, 3076 struct nfs4_fs_locations,
3027 fattr))) != 0) 3077 fattr));
3078 if (status < 0)
3028 goto xdr_error; 3079 goto xdr_error;
3029 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3080 fattr->valid |= status;
3081
3082 status = decode_attr_mode(xdr, bitmap, &fmode);
3083 if (status < 0)
3030 goto xdr_error; 3084 goto xdr_error;
3031 fattr->mode |= fmode; 3085 if (status != 0) {
3032 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3086 fattr->mode |= fmode;
3087 fattr->valid |= status;
3088 }
3089
3090 status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
3091 if (status < 0)
3033 goto xdr_error; 3092 goto xdr_error;
3034 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) 3093 fattr->valid |= status;
3094
3095 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
3096 if (status < 0)
3035 goto xdr_error; 3097 goto xdr_error;
3036 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) 3098 fattr->valid |= status;
3099
3100 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
3101 if (status < 0)
3037 goto xdr_error; 3102 goto xdr_error;
3038 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3103 fattr->valid |= status;
3104
3105 status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
3106 if (status < 0)
3039 goto xdr_error; 3107 goto xdr_error;
3040 if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) 3108 fattr->valid |= status;
3109
3110 status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
3111 if (status < 0)
3041 goto xdr_error; 3112 goto xdr_error;
3042 if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) 3113 fattr->valid |= status;
3114
3115 status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
3116 if (status < 0)
3043 goto xdr_error; 3117 goto xdr_error;
3044 if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) 3118 fattr->valid |= status;
3119
3120 status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
3121 if (status < 0)
3045 goto xdr_error; 3122 goto xdr_error;
3046 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3123 fattr->valid |= status;
3124
3125 status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
3126 if (status < 0)
3047 goto xdr_error; 3127 goto xdr_error;
3048 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) 3128 fattr->valid |= status;
3129
3130 status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
3131 if (status < 0)
3049 goto xdr_error; 3132 goto xdr_error;
3050 if (fattr->fileid == 0 && fileid != 0) 3133 if (status != 0 && !(fattr->valid & status)) {
3051 fattr->fileid = fileid; 3134 fattr->fileid = fileid;
3052 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3135 fattr->valid |= status;
3053 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3136 }
3137
3138 status = verify_attr_len(xdr, savep, attrlen);
3054xdr_error: 3139xdr_error:
3055 dprintk("%s: xdr returned %d\n", __func__, -status); 3140 dprintk("%s: xdr returned %d\n", __func__, -status);
3056 return status; 3141 return status;
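
With that convention in place, decode_getfattr() no longer sets NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4 wholesale once verify_attr_len() succeeds; it ORs each positive return into fattr->valid as it goes, so the mask records exactly which attributes the server supplied. Every per-attribute step above reduces to the same idiom:

	status = decode_attr_size(xdr, bitmap, &fattr->size);
	if (status < 0)			/* XDR error: abort the whole decode */
		goto xdr_error;
	fattr->valid |= status;		/* 0 (absent) or NFS_ATTR_FATTR_SIZE */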
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4078 status = decode_setattr(&xdr, res); 4163 status = decode_setattr(&xdr, res);
4079 if (status) 4164 if (status)
4080 goto out; 4165 goto out;
4081 status = decode_getfattr(&xdr, res->fattr, res->server); 4166 decode_getfattr(&xdr, res->fattr, res->server);
4082 if (status == NFS4ERR_DELAY)
4083 status = 0;
4084out: 4167out:
4085 return status; 4168 return status;
4086} 4169}
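
Note that nfs4_xdr_dec_setattr() now discards the decode_getfattr() result outright instead of special-casing NFS4ERR_DELAY: the SETATTR itself has already succeeded, and the trailing GETATTR is best-effort. A sketch of the resulting contract:

	/* post-op attributes are advisory only */
	decode_getfattr(&xdr, res->fattr, res->server);
	/* a decode failure just leaves fattr->valid reflecting whatever
	 * was recovered; it never fails the SETATTR reply itself */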
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d9ef602fbc5a..e3ed5908820b 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -129,7 +129,7 @@ enum {
129 Opt_err 129 Opt_err
130}; 130};
131 131
132static match_table_t __initconst tokens = { 132static const match_table_t tokens __initconst = {
133 {Opt_port, "port=%u"}, 133 {Opt_port, "port=%u"},
134 {Opt_rsize, "rsize=%u"}, 134 {Opt_rsize, "rsize=%u"},
135 {Opt_wsize, "wsize=%u"}, 135 {Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 176 kref_put(&req->wb_kref, nfs_free_request);
177} 177}
178 178
179static int nfs_wait_bit_killable(void *word)
180{
181 int ret = 0;
182
183 if (fatal_signal_pending(current))
184 ret = -ERESTARTSYS;
185 else
186 schedule();
187 return ret;
188}
189
190/** 179/**
191 * nfs_wait_on_request - Wait for a request to complete. 180 * nfs_wait_on_request - Wait for a request to complete.
192 * @req: request to wait upon. 181 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
663 .commit_setup = nfs_proc_commit_setup, 663 .commit_setup = nfs_proc_commit_setup,
664 .lock = nfs_proc_lock, 664 .lock = nfs_proc_lock,
665 .lock_check_bounds = nfs_lock_check_bounds, 665 .lock_check_bounds = nfs_lock_check_bounds,
666 .close_context = nfs_close_context,
666}; 667};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
24 24
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h"
27 28
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 29#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 30
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
111 } 112 }
112} 113}
113 114
114static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 115int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 116 struct page *page)
116{ 117{
117 LIST_HEAD(one_request); 118 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
139 140
140static void nfs_readpage_release(struct nfs_page *req) 141static void nfs_readpage_release(struct nfs_page *req)
141{ 142{
143 struct inode *d_inode = req->wb_context->path.dentry->d_inode;
144
145 if (PageUptodate(req->wb_page))
146 nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
147
142 unlock_page(req->wb_page); 148 unlock_page(req->wb_page);
143 149
144 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 150 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
510 } else 516 } else
511 ctx = get_nfs_open_context(nfs_file_open_context(file)); 517 ctx = get_nfs_open_context(nfs_file_open_context(file));
512 518
519 if (!IS_SYNC(inode)) {
520 error = nfs_readpage_from_fscache(ctx, inode, page);
521 if (error == 0)
522 goto out;
523 }
524
513 error = nfs_readpage_async(ctx, inode, page); 525 error = nfs_readpage_async(ctx, inode, page);
514 526
527out:
515 put_nfs_open_context(ctx); 528 put_nfs_open_context(ctx);
516 return error; 529 return error;
517out_unlock: 530out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
584 return -EBADF; 597 return -EBADF;
585 } else 598 } else
586 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); 599 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
600
601 /* attempt to read as many of the pages as possible from the cache
602 * - this returns -ENOBUFS immediately if the cookie is negative
603 */
604 ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
605 pages, &nr_pages);
606 if (ret == 0)
607 goto read_complete; /* all pages were read */
608
587 if (rsize < PAGE_CACHE_SIZE) 609 if (rsize < PAGE_CACHE_SIZE)
588 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 610 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
589 else 611 else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
594 nfs_pageio_complete(&pgio); 616 nfs_pageio_complete(&pgio);
595 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 617 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
596 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 618 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
619read_complete:
597 put_nfs_open_context(desc.ctx); 620 put_nfs_open_context(desc.ctx);
598out: 621out:
599 return ret; 622 return ret;
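
The fscache read hooks appear to follow a single convention: return 0 when the request was satisfied from the local cache, and a negative value (-ENOBUFS when no cache cookie is attached, per the comment above) to mean "fall back to the network". The nfs_readpage() fast path is the one-page instance of it:

	if (!IS_SYNC(inode)) {
		error = nfs_readpage_from_fscache(ctx, inode, page);
		if (error == 0)
			goto out;	/* page filled from the cache */
		/* -ENOBUFS and friends: read over the wire instead */
	}
	error = nfs_readpage_async(ctx, inode, page);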
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..d2d67781c579 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
651 seq_printf(m, "\n\tbytes:\t"); 663 seq_printf(m, "\n\tbytes:\t");
652 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 664 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
653 seq_printf(m, "%Lu ", totals.bytes[i]); 665 seq_printf(m, "%Lu ", totals.bytes[i]);
666#ifdef CONFIG_NFS_FSCACHE
667 if (nfss->options & NFS_OPTION_FSCACHE) {
668 seq_printf(m, "\n\tfsc:\t");
669 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
670 seq_printf(m, "%Lu ", totals.fscache[i]);
671 }
672#endif
654 seq_printf(m, "\n"); 673 seq_printf(m, "\n");
655 674
656 rpc_print_iostats(m, nfss->client); 675 rpc_print_iostats(m, nfss->client);
@@ -664,9 +683,12 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
664 */ 683 */
665static void nfs_umount_begin(struct super_block *sb) 684static void nfs_umount_begin(struct super_block *sb)
666{ 685{
667 struct nfs_server *server = NFS_SB(sb); 686 struct nfs_server *server;
668 struct rpc_clnt *rpc; 687 struct rpc_clnt *rpc;
669 688
689 lock_kernel();
690
691 server = NFS_SB(sb);
670 /* -EIO all pending I/O */ 692 /* -EIO all pending I/O */
671 rpc = server->client_acl; 693 rpc = server->client_acl;
672 if (!IS_ERR(rpc)) 694 if (!IS_ERR(rpc))
@@ -674,6 +696,8 @@ static void nfs_umount_begin(struct super_block *sb)
674 rpc = server->client; 696 rpc = server->client;
675 if (!IS_ERR(rpc)) 697 if (!IS_ERR(rpc))
676 rpc_killall_tasks(rpc); 698 rpc_killall_tasks(rpc);
699
700 unlock_kernel();
677} 701}
678 702
679/* 703/*
@@ -1018,6 +1042,7 @@ static int nfs_parse_mount_options(char *raw,
1018 case Opt_rdma: 1042 case Opt_rdma:
1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1043 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1044 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1045 xprt_load_transport(p);
1021 break; 1046 break;
1022 case Opt_acl: 1047 case Opt_acl:
1023 mnt->flags &= ~NFS_MOUNT_NOACL; 1048 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1043,6 +1068,24 @@ static int nfs_parse_mount_options(char *raw,
1043 case Opt_noresvport: 1068 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT; 1069 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break; 1070 break;
1071 case Opt_fscache:
1072 mnt->options |= NFS_OPTION_FSCACHE;
1073 kfree(mnt->fscache_uniq);
1074 mnt->fscache_uniq = NULL;
1075 break;
1076 case Opt_nofscache:
1077 mnt->options &= ~NFS_OPTION_FSCACHE;
1078 kfree(mnt->fscache_uniq);
1079 mnt->fscache_uniq = NULL;
1080 break;
1081 case Opt_fscache_uniq:
1082 string = match_strdup(args);
1083 if (!string)
1084 goto out_nomem;
1085 kfree(mnt->fscache_uniq);
1086 mnt->fscache_uniq = string;
1087 mnt->options |= NFS_OPTION_FSCACHE;
1088 break;
1046 1089
1047 /* 1090 /*
1048 * options that take numeric values 1091 * options that take numeric values
@@ -1190,7 +1233,6 @@ static int nfs_parse_mount_options(char *raw,
1190 goto out_nomem; 1233 goto out_nomem;
1191 token = match_token(string, 1234 token = match_token(string,
1192 nfs_xprt_protocol_tokens, args); 1235 nfs_xprt_protocol_tokens, args);
1193 kfree(string);
1194 1236
1195 switch (token) { 1237 switch (token) {
1196 case Opt_xprt_udp: 1238 case Opt_xprt_udp:
@@ -1205,12 +1247,14 @@ static int nfs_parse_mount_options(char *raw,
1205 /* vector side protocols to TCP */ 1247 /* vector side protocols to TCP */
1206 mnt->flags |= NFS_MOUNT_TCP; 1248 mnt->flags |= NFS_MOUNT_TCP;
1207 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1249 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1250 xprt_load_transport(string);
1208 break; 1251 break;
1209 default: 1252 default:
1210 errors++; 1253 errors++;
1211 dfprintk(MOUNT, "NFS: unrecognized " 1254 dfprintk(MOUNT, "NFS: unrecognized "
1212 "transport protocol\n"); 1255 "transport protocol\n");
1213 } 1256 }
1257 kfree(string);
1214 break; 1258 break;
1215 case Opt_mountproto: 1259 case Opt_mountproto:
1216 string = match_strdup(args); 1260 string = match_strdup(args);
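
Moving kfree(string) below the switch is not cosmetic: the Opt_xprt_rdma arm now passes the option string to xprt_load_transport(string), so freeing it right after match_token(), as before, would hand the transport loader a dangling pointer. The corrected lifetime, in sketch form:

	string = match_strdup(args);
	if (string == NULL)
		goto out_nomem;
	token = match_token(string, nfs_xprt_protocol_tokens, args);
	switch (token) {
	case Opt_xprt_rdma:
		xprt_load_transport(string);	/* still needs the string */
		break;
	default:
		break;
	}
	kfree(string);				/* freed only after its last use */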
@@ -1868,8 +1912,6 @@ static void nfs_clone_super(struct super_block *sb,
1868 nfs_initialise_sb(sb); 1912 nfs_initialise_sb(sb);
1869} 1913}
1870 1914
1871#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
1872
1873static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) 1915static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
1874{ 1916{
1875 const struct nfs_server *a = s->s_fs_info; 1917 const struct nfs_server *a = s->s_fs_info;
@@ -2034,6 +2076,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2034 if (!s->s_root) { 2076 if (!s->s_root) {
2035 /* initial superblock/root creation */ 2077 /* initial superblock/root creation */
2036 nfs_fill_super(s, data); 2078 nfs_fill_super(s, data);
2079 nfs_fscache_get_super_cookie(s, data);
2037 } 2080 }
2038 2081
2039 mntroot = nfs_get_root(s, mntfh); 2082 mntroot = nfs_get_root(s, mntfh);
@@ -2054,6 +2097,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2054out: 2097out:
2055 kfree(data->nfs_server.hostname); 2098 kfree(data->nfs_server.hostname);
2056 kfree(data->mount_server.hostname); 2099 kfree(data->mount_server.hostname);
2100 kfree(data->fscache_uniq);
2057 security_free_mnt_opts(&data->lsm_opts); 2101 security_free_mnt_opts(&data->lsm_opts);
2058out_free_fh: 2102out_free_fh:
2059 kfree(mntfh); 2103 kfree(mntfh);
@@ -2067,8 +2111,7 @@ out_err_nosb:
2067error_splat_root: 2111error_splat_root:
2068 dput(mntroot); 2112 dput(mntroot);
2069error_splat_super: 2113error_splat_super:
2070 up_write(&s->s_umount); 2114 deactivate_locked_super(s);
2071 deactivate_super(s);
2072 goto out; 2115 goto out;
2073} 2116}
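
This error-path change (repeated verbatim in the nfs4 variants below) folds the unlock/release pair into deactivate_locked_super(), which is assumed to drop the active superblock reference while s_umount is still write-held rather than after it has been released:

	error_splat_super:
		/* old: up_write(&s->s_umount); deactivate_super(s);
		 *      the sb was briefly unlocked but still active */
		deactivate_locked_super(s);	/* ref dropped under s_umount */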
2074 2117
@@ -2081,6 +2124,7 @@ static void nfs_kill_super(struct super_block *s)
2081 2124
2082 bdi_unregister(&server->backing_dev_info); 2125 bdi_unregister(&server->backing_dev_info);
2083 kill_anon_super(s); 2126 kill_anon_super(s);
2127 nfs_fscache_release_super_cookie(s);
2084 nfs_free_server(server); 2128 nfs_free_server(server);
2085} 2129}
2086 2130
@@ -2163,8 +2207,7 @@ out_err_noserver:
2163 return error; 2207 return error;
2164 2208
2165error_splat_super: 2209error_splat_super:
2166 up_write(&s->s_umount); 2210 deactivate_locked_super(s);
2167 deactivate_super(s);
2168 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2211 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2169 return error; 2212 return error;
2170} 2213}
@@ -2388,6 +2431,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2388 if (!s->s_root) { 2431 if (!s->s_root) {
2389 /* initial superblock/root creation */ 2432 /* initial superblock/root creation */
2390 nfs4_fill_super(s); 2433 nfs4_fill_super(s);
2434 nfs_fscache_get_super_cookie(s, data);
2391 } 2435 }
2392 2436
2393 mntroot = nfs4_get_root(s, mntfh); 2437 mntroot = nfs4_get_root(s, mntfh);
@@ -2409,6 +2453,7 @@ out:
2409 kfree(data->client_address); 2453 kfree(data->client_address);
2410 kfree(data->nfs_server.export_path); 2454 kfree(data->nfs_server.export_path);
2411 kfree(data->nfs_server.hostname); 2455 kfree(data->nfs_server.hostname);
2456 kfree(data->fscache_uniq);
2412 security_free_mnt_opts(&data->lsm_opts); 2457 security_free_mnt_opts(&data->lsm_opts);
2413out_free_fh: 2458out_free_fh:
2414 kfree(mntfh); 2459 kfree(mntfh);
@@ -2422,8 +2467,7 @@ out_free:
2422error_splat_root: 2467error_splat_root:
2423 dput(mntroot); 2468 dput(mntroot);
2424error_splat_super: 2469error_splat_super:
2425 up_write(&s->s_umount); 2470 deactivate_locked_super(s);
2426 deactivate_super(s);
2427 goto out; 2471 goto out;
2428} 2472}
2429 2473
@@ -2435,6 +2479,7 @@ static void nfs4_kill_super(struct super_block *sb)
2435 kill_anon_super(sb); 2479 kill_anon_super(sb);
2436 2480
2437 nfs4_renewd_prepare_shutdown(server); 2481 nfs4_renewd_prepare_shutdown(server);
2482 nfs_fscache_release_super_cookie(sb);
2438 nfs_free_server(server); 2483 nfs_free_server(server);
2439} 2484}
2440 2485
@@ -2516,8 +2561,7 @@ out_err_noserver:
2516 return error; 2561 return error;
2517 2562
2518error_splat_super: 2563error_splat_super:
2519 up_write(&s->s_umount); 2564 deactivate_locked_super(s);
2520 deactivate_super(s);
2521 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2565 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2522 return error; 2566 return error;
2523} 2567}
@@ -2601,8 +2645,7 @@ out_err_noserver:
2601 return error; 2645 return error;
2602 2646
2603error_splat_super: 2647error_splat_super:
2604 up_write(&s->s_umount); 2648 deactivate_locked_super(s);
2605 deactivate_super(s);
2606 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2649 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2607 return error; 2650 return error;
2608} 2651}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
314{ 314{
315 struct inode *inode = mapping->host; 315 struct inode *inode = mapping->host;
316 unsigned long *bitlock = &NFS_I(inode)->flags;
316 struct nfs_pageio_descriptor pgio; 317 struct nfs_pageio_descriptor pgio;
317 int err; 318 int err;
318 319
320 /* Stop dirtying of new pages while we sync */
321 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
322 nfs_wait_bit_killable, TASK_KILLABLE);
323 if (err)
324 goto out_err;
325
319 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 326 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
320 327
321 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 328 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
322 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 329 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
323 nfs_pageio_complete(&pgio); 330 nfs_pageio_complete(&pgio);
331
332 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
333 smp_mb__after_clear_bit();
334 wake_up_bit(bitlock, NFS_INO_FLUSHING);
335
324 if (err < 0) 336 if (err < 0)
325 return err; 337 goto out_err;
326 if (pgio.pg_error < 0) 338 err = pgio.pg_error;
327 return pgio.pg_error; 339 if (err < 0)
340 goto out_err;
328 return 0; 341 return 0;
342out_err:
343 return err;
329} 344}
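
NFS_INO_FLUSHING acts as a bit-lock on the inode flags word: a writer entering nfs_writepages() sleeps killably until the bit is clear, and the unlock side must pair clear_bit_unlock() with a barrier and an explicit wake-up or sleepers can be missed. The generic shape of the pattern used above (nfs_wait_bit_killable is evidently the helper whose private copy the pagelist.c hunk deletes, now shared across call sites):

	err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
			       nfs_wait_bit_killable, TASK_KILLABLE);
	if (err)			/* -ERESTARTSYS on a fatal signal */
		return err;

	/* ... flush dirty pages ... */

	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
	smp_mb__after_clear_bit();	/* order the clear before the wake test */
	wake_up_bit(bitlock, NFS_INO_FLUSHING);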
330 345
331/* 346/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
404 struct nfs_inode *nfsi = NFS_I(inode); 419 struct nfs_inode *nfsi = NFS_I(inode);
405 420
406 spin_lock(&inode->i_lock); 421 spin_lock(&inode->i_lock);
407 nfsi->ncommit++;
408 set_bit(PG_CLEAN, &(req)->wb_flags); 422 set_bit(PG_CLEAN, &(req)->wb_flags);
409 radix_tree_tag_set(&nfsi->nfs_page_tree, 423 radix_tree_tag_set(&nfsi->nfs_page_tree,
410 req->wb_index, 424 req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
524} 538}
525 539
526#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 540#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
541static int
542nfs_need_commit(struct nfs_inode *nfsi)
543{
544 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
545}
546
527/* 547/*
528 * nfs_scan_commit - Scan an inode for commit requests 548 * nfs_scan_commit - Scan an inode for commit requests
529 * @inode: NFS inode to scan 549 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
538nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 558nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
539{ 559{
540 struct nfs_inode *nfsi = NFS_I(inode); 560 struct nfs_inode *nfsi = NFS_I(inode);
541 int res = 0;
542 561
543 if (nfsi->ncommit != 0) { 562 if (!nfs_need_commit(nfsi))
544 res = nfs_scan_list(nfsi, dst, idx_start, npages, 563 return 0;
545 NFS_PAGE_TAG_COMMIT); 564
546 nfsi->ncommit -= res; 565 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 }
548 return res;
549} 566}
550#else 567#else
568static inline int nfs_need_commit(struct nfs_inode *nfsi)
569{
570 return 0;
571}
572
551static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 573static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
552{ 574{
553 return 0; 575 return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
820 data->args.stable = NFS_UNSTABLE; 842 data->args.stable = NFS_UNSTABLE;
821 if (how & FLUSH_STABLE) { 843 if (how & FLUSH_STABLE) {
822 data->args.stable = NFS_DATA_SYNC; 844 data->args.stable = NFS_DATA_SYNC;
823 if (!NFS_I(inode)->ncommit) 845 if (!nfs_need_commit(NFS_I(inode)))
824 data->args.stable = NFS_FILE_SYNC; 846 data->args.stable = NFS_FILE_SYNC;
825 } 847 }
826 848
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1425{ 1447{
1426 struct writeback_control wbc = { 1448 struct writeback_control wbc = {
1427 .bdi = mapping->backing_dev_info, 1449 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1450 .sync_mode = WB_SYNC_ALL,
1429 .nr_to_write = LONG_MAX, 1451 .nr_to_write = LONG_MAX,
1430 .range_start = 0, 1452 .range_start = 0,
1431 .range_end = LLONG_MAX, 1453 .range_end = LLONG_MAX,
1432 .for_writepages = 1, 1454 .for_writepages = 1,
1433 }; 1455 };
1434 int ret;
1435 1456
1436 ret = __nfs_write_mapping(mapping, &wbc, how);
1437 if (ret < 0)
1438 return ret;
1439 wbc.sync_mode = WB_SYNC_ALL;
1440 return __nfs_write_mapping(mapping, &wbc, how); 1457 return __nfs_write_mapping(mapping, &wbc, how);
1441} 1458}
1442 1459
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
1config NFSD 1config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select EXPORTFS 7 select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/major.h> 20#include <linux/major.h>
21#include <linux/magic.h>
21 22
22#include <linux/sunrpc/svc.h> 23#include <linux/sunrpc/svc.h>
23#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
202 struct nfsd3_writeres *resp) 203 struct nfsd3_writeres *resp)
203{ 204{
204 __be32 nfserr; 205 __be32 nfserr;
206 unsigned long cnt = argp->len;
205 207
206 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 208 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
207 SVCFH_fmt(&argp->fh), 209 SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
214 nfserr = nfsd_write(rqstp, &resp->fh, NULL, 216 nfserr = nfsd_write(rqstp, &resp->fh, NULL,
215 argp->offset, 217 argp->offset,
216 rqstp->rq_vec, argp->vlen, 218 rqstp->rq_vec, argp->vlen,
217 argp->len, 219 &cnt,
218 &resp->committed); 220 &resp->committed);
219 resp->count = argp->count; 221 resp->count = cnt;
220 RETURN_STATUS(nfserr); 222 RETURN_STATUS(nfserr);
221} 223}
222 224
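
nfsd_write() evidently now takes the byte count by reference and updates it in place, so callers seed the count with the requested length and read back what was actually written; nfsd4_write below makes the same conversion. In outline:

	unsigned long cnt = argp->len;		/* in: bytes requested */

	nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset,
			    rqstp->rq_vec, argp->vlen, &cnt,
			    &resp->committed);
	resp->count = cnt;			/* out: bytes actually written */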
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
569 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; 571 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
570 572
571 /* Note that we don't care for remote fs's here */ 573 /* Note that we don't care for remote fs's here */
572 if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { 574 if (sb->s_magic == MSDOS_SUPER_MAGIC) {
573 resp->f_properties = NFS3_FSF_BILLYBOY; 575 resp->f_properties = NFS3_FSF_BILLYBOY;
574 } 576 }
575 resp->f_maxfilesize = sb->s_maxbytes; 577 resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
610 resp->p_link_max = EXT2_LINK_MAX; 612 resp->p_link_max = EXT2_LINK_MAX;
611 resp->p_name_max = EXT2_NAME_LEN; 613 resp->p_name_max = EXT2_NAME_LEN;
612 break; 614 break;
613 case 0x4d44: /* MSDOS_SUPER_MAGIC */ 615 case MSDOS_SUPER_MAGIC:
614 resp->p_case_insensitive = 1; 616 resp->p_case_insensitive = 1;
615 resp->p_case_preserving = 0; 617 resp->p_case_preserving = 0;
616 break; 618 break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
219{ 219{
220 __be32 *p; 220 __be32 *p;
221 int len = cb_rec->cbr_fhlen; 221 int len = cb_rec->cbr_fh.fh_size;
222 222
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
224 WRITE32(OP_CB_RECALL); 224 WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 227 WRITE32(cb_rec->cbr_trunc);
228 WRITE32(len); 228 WRITE32(len);
229 WRITEMEM(cb_rec->cbr_fhval, len); 229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
230 return 0; 230 return 0;
231} 231}
232 232
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
361/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 362 * And why is cb_set an atomic? */
363 363
364static int do_probe_callback(void *data) 364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
365{ 365{
366 struct nfs4_client *clp = data;
367 struct sockaddr_in addr; 366 struct sockaddr_in addr;
368 struct nfs4_callback *cb = &clp->cl_callback; 367 struct nfs4_callback *cb = &clp->cl_callback;
369 struct rpc_timeout timeparms = { 368 struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 383 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal, 384 .client_name = clp->cl_principal,
386 }; 385 };
387 struct rpc_message msg = {
388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
389 .rpc_argp = clp,
390 };
391 struct rpc_clnt *client; 386 struct rpc_clnt *client;
392 int status;
393 387
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { 388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
395 status = nfserr_cb_path_down; 389 return ERR_PTR(-EINVAL);
396 goto out_err;
397 }
398 390
399 /* Initialize address */ 391 /* Initialize address */
400 memset(&addr, 0, sizeof(addr)); 392 memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
404 396
405 /* Create RPC client */ 397 /* Create RPC client */
406 client = rpc_create(&args); 398 client = rpc_create(&args);
399 if (IS_ERR(client))
400 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client));
402 return client;
403
404}
405
406static int do_probe_callback(void *data)
407{
408 struct nfs4_client *clp = data;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp,
413 };
414 struct rpc_clnt *client;
415 int status;
416
417 client = setup_callback_client(clp);
407 if (IS_ERR(client)) { 418 if (IS_ERR(client)) {
408 dprintk("NFSD: couldn't create callback client\n");
409 status = PTR_ERR(client); 419 status = PTR_ERR(client);
420 dprintk("NFSD: couldn't create callback client: %d\n",
421 status);
410 goto out_err; 422 goto out_err;
411 } 423 }
412 424
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
422out_release_client: 434out_release_client:
423 rpc_shutdown_client(client); 435 rpc_shutdown_client(client);
424out_err: 436out_err:
425 dprintk("NFSD: warning: no callback path to client %.*s\n", 437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
426 (int)clp->cl_name.len, clp->cl_name.data); 438 (int)clp->cl_name.len, clp->cl_name.data, status);
427 put_nfs4_client(clp); 439 put_nfs4_client(clp);
428 return status; 440 return 0;
429} 441}
430 442
431/* 443/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
451 463
452/* 464/*
453 * called with dp->dl_count inc'ed. 465 * called with dp->dl_count inc'ed.
454 * nfs4_lock_state() may or may not have been called.
455 */ 466 */
456void 467void
457nfsd4_cb_recall(struct nfs4_delegation *dp) 468nfsd4_cb_recall(struct nfs4_delegation *dp)
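
Splitting setup_callback_client() out of do_probe_callback() also switches its error reporting to the usual pointer-encoding convention: the helper returns either a valid struct rpc_clnt * or an ERR_PTR()-encoded errno, never NULL, so the caller tests it like this:

	struct rpc_clnt *client = setup_callback_client(clp);

	if (IS_ERR(client)) {
		status = PTR_ERR(client);	/* recover the errno */
		goto out_err;
	}
	/* client is usable from here on */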
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
93 open->op_truncate = 0; 93 open->op_truncate = 0;
94 94
95 if (open->op_create) { 95 if (open->op_create) {
96 /* FIXME: check session persistence and pnfs flags.
97 * The nfsv4.1 spec requires the following semantics:
98 *
99 * Persistent | pNFS | Server REQUIRED | Client Allowed
100 * Reply Cache | server | |
101 * -------------+--------+-----------------+--------------------
102 * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
103 * | | | (SHOULD)
104 * | | and EXCLUSIVE4 | or EXCLUSIVE4
105 * | | | (SHOULD NOT)
106 * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
107 * yes | no | GUARDED4 | GUARDED4
108 * yes | yes | GUARDED4 | GUARDED4
109 */
110
96 /* 111 /*
97 * Note: create modes (UNCHECKED,GUARDED...) are the same 112 * Note: create modes (UNCHECKED,GUARDED...) are the same
98 * in NFSv4 as in v3. 113 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
103 (u32 *)open->op_verf.data, 118 (u32 *)open->op_verf.data,
104 &open->op_truncate, &created); 119 &open->op_truncate, &created);
105 120
106 /* If we ever decide to use different attrs to store the 121 /*
107 * verifier in nfsd_create_v3, then we'll need to change this 122 * Following rfc 3530 14.2.16, use the returned bitmask
123 * to indicate which attributes we used to store the
124 * verifier:
108 */ 125 */
109 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) 126 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
110 open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | 127 open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
111 FATTR4_WORD1_TIME_MODIFY); 128 FATTR4_WORD1_TIME_MODIFY);
112 } else { 129 } else {
113 status = nfsd_lookup(rqstp, current_fh, 130 status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
118 goto out; 135 goto out;
119 136
120 set_change_info(&open->op_cinfo, current_fh); 137 set_change_info(&open->op_cinfo, current_fh);
121
122 /* set reply cache */
123 fh_dup2(current_fh, &resfh); 138 fh_dup2(current_fh, &resfh);
124 open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
125 memcpy(open->op_stateowner->so_replay.rp_openfh,
126 &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
127 139
140 /* set reply cache */
141 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
142 &resfh.fh_handle);
128 if (!created) 143 if (!created)
129 status = do_open_permission(rqstp, current_fh, open, 144 status = do_open_permission(rqstp, current_fh, open,
130 NFSD_MAY_NOP); 145 NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
150 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 165 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
151 166
152 /* set replay cache */ 167 /* set replay cache */
153 open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; 168 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
154 memcpy(open->op_stateowner->so_replay.rp_openfh, 169 &current_fh->fh_handle);
155 &current_fh->fh_handle.fh_base,
156 current_fh->fh_handle.fh_size);
157 170
158 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 171 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
159 (open->op_iattr.ia_size == 0); 172 (open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
164 return status; 177 return status;
165} 178}
166 179
180static void
181copy_clientid(clientid_t *clid, struct nfsd4_session *session)
182{
183 struct nfsd4_sessionid *sid =
184 (struct nfsd4_sessionid *)session->se_sessionid.data;
185
186 clid->cl_boot = sid->clientid.cl_boot;
187 clid->cl_id = sid->clientid.cl_id;
188}
167 189
168static __be32 190static __be32
169nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 191nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
170 struct nfsd4_open *open) 192 struct nfsd4_open *open)
171{ 193{
172 __be32 status; 194 __be32 status;
195 struct nfsd4_compoundres *resp;
196
173 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 197 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
174 (int)open->op_fname.len, open->op_fname.data, 198 (int)open->op_fname.len, open->op_fname.data,
175 open->op_stateowner); 199 open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
178 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 202 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
179 return nfserr_inval; 203 return nfserr_inval;
180 204
205 if (nfsd4_has_session(cstate))
206 copy_clientid(&open->op_clientid, cstate->session);
207
181 nfs4_lock_state(); 208 nfs4_lock_state();
182 209
183 /* check seqid for replay. set nfs4_owner */ 210 /* check seqid for replay. set nfs4_owner */
184 status = nfsd4_process_open1(open); 211 resp = rqstp->rq_resp;
212 status = nfsd4_process_open1(&resp->cstate, open);
185 if (status == nfserr_replay_me) { 213 if (status == nfserr_replay_me) {
186 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 214 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
187 fh_put(&cstate->current_fh); 215 fh_put(&cstate->current_fh);
188 cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; 216 fh_copy_shallow(&cstate->current_fh.fh_handle,
189 memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, 217 &rp->rp_openfh);
190 rp->rp_openfh_len);
191 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 218 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
192 if (status) 219 if (status)
193 dprintk("nfsd4_open: replay failed" 220 dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
209 236
210 switch (open->op_claim_type) { 237 switch (open->op_claim_type) {
211 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 238 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
212 status = nfserr_inval;
213 if (open->op_create)
214 goto out;
215 /* fall through */
216 case NFS4_OPEN_CLAIM_NULL: 239 case NFS4_OPEN_CLAIM_NULL:
217 /* 240 /*
218 * (1) set CURRENT_FH to the file being opened, 241 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
455 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 478 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
456 return nfserr_inval; 479 return nfserr_inval;
457 480
458 getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 481 getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
459 getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 482 getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
483 getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
460 484
461 getattr->ga_fhp = &cstate->current_fh; 485 getattr->ga_fhp = &cstate->current_fh;
462 return nfs_ok; 486 return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
520 544
521 nfs4_lock_state(); 545 nfs4_lock_state();
522 /* check stateid */ 546 /* check stateid */
523 if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, 547 if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
524 &read->rd_stateid, 548 RD_STATE, &read->rd_filp))) {
525 CHECK_FH | RD_STATE, &read->rd_filp))) {
526 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); 549 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
527 goto out; 550 goto out;
528 } 551 }
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
548 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 571 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
549 return nfserr_inval; 572 return nfserr_inval;
550 573
551 readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 574 readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
552 readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 575 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
576 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
553 577
554 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 578 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
555 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 579 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
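
The nfsd_suppattrsN() helpers replace the fixed NFSD_SUPPORTED_ATTRS_WORDn masks so the supported-attribute words can vary with the minor version (NFSv4.1 adds a third bitmap word). A plausible shape for them, assuming per-minorversion mask constants along these lines:

	static inline u32 nfsd_suppattrs0(u32 minorversion)
	{
		return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
				    : NFSD4_SUPPORTED_ATTRS_WORD0;
	}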
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
653 677
654 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 678 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
655 nfs4_lock_state(); 679 nfs4_lock_state();
656 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 680 status = nfs4_preprocess_stateid_op(cstate,
657 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); 681 &setattr->sa_stateid, WR_STATE, NULL);
658 nfs4_unlock_state(); 682 nfs4_unlock_state();
659 if (status) { 683 if (status) {
660 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); 684 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
685 struct file *filp = NULL; 709 struct file *filp = NULL;
686 u32 *p; 710 u32 *p;
687 __be32 status = nfs_ok; 711 __be32 status = nfs_ok;
712 unsigned long cnt;
688 713
689 /* no need to check permission - this will be done in nfsd_write() */ 714 /* no need to check permission - this will be done in nfsd_write() */
690 715
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
692 return nfserr_inval; 717 return nfserr_inval;
693 718
694 nfs4_lock_state(); 719 nfs4_lock_state();
695 status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, 720 status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
696 CHECK_FH | WR_STATE, &filp);
697 if (filp) 721 if (filp)
698 get_file(filp); 722 get_file(filp);
699 nfs4_unlock_state(); 723 nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
703 return status; 727 return status;
704 } 728 }
705 729
706 write->wr_bytes_written = write->wr_buflen; 730 cnt = write->wr_buflen;
707 write->wr_how_written = write->wr_stable_how; 731 write->wr_how_written = write->wr_stable_how;
708 p = (u32 *)write->wr_verifier.data; 732 p = (u32 *)write->wr_verifier.data;
709 *p++ = nfssvc_boot.tv_sec; 733 *p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
711 735
712 status = nfsd_write(rqstp, &cstate->current_fh, filp, 736 status = nfsd_write(rqstp, &cstate->current_fh, filp,
713 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 737 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
714 write->wr_buflen, &write->wr_how_written); 738 &cnt, &write->wr_how_written);
715 if (filp) 739 if (filp)
716 fput(filp); 740 fput(filp);
717 741
742 write->wr_bytes_written = cnt;
743
718 if (status == nfserr_symlink) 744 if (status == nfserr_symlink)
719 status = nfserr_inval; 745 status = nfserr_inval;
720 return status; 746 return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
737 if (status) 763 if (status)
738 return status; 764 return status;
739 765
740 if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) 766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
741 || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
742 return nfserr_attrnotsupp; 769 return nfserr_attrnotsupp;
743 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
744 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
766 if (status) 793 if (status)
767 goto out_kfree; 794 goto out_kfree;
768 795
769 p = buf + 3; 796 /* skip bitmap */
797 p = buf + 1 + ntohl(buf[0]);
770 status = nfserr_not_same; 798 status = nfserr_not_same;
771 if (ntohl(*p++) != verify->ve_attrlen) 799 if (ntohl(*p++) != verify->ve_attrlen)
772 goto out_kfree; 800 goto out_kfree;
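
The replaced buf + 3 hard-coded a two-word attribute bitmap (one count word plus two mask words). The bitmap is variable-length on the wire, and NFSv4.1 replies can carry a third mask word, so the verify path has to skip it by its encoded count:

	/* GETATTR attr data layout:
	 *   word 0       bitmap word count N
	 *   words 1..N   bitmap mask words
	 *   word N+1     attribute byte length
	 *   then         the attribute values themselves
	 */
	p = buf + 1 + ntohl(buf[0]);	/* step over count word + N mask words */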
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
813 nfsdstats.nfs4_opcount[opnum]++; 841 nfsdstats.nfs4_opcount[opnum]++;
814} 842}
815 843
816static void cstate_free(struct nfsd4_compound_state *cstate)
817{
818 if (cstate == NULL)
819 return;
820 fh_put(&cstate->current_fh);
821 fh_put(&cstate->save_fh);
822 BUG_ON(cstate->replay_owner);
823 kfree(cstate);
824}
825
826static struct nfsd4_compound_state *cstate_alloc(void)
827{
828 struct nfsd4_compound_state *cstate;
829
830 cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
831 if (cstate == NULL)
832 return NULL;
833 fh_init(&cstate->current_fh, NFS4_FHSIZE);
834 fh_init(&cstate->save_fh, NFS4_FHSIZE);
835 cstate->replay_owner = NULL;
836 return cstate;
837}
838
839typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 844typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
840 void *); 845 void *);
846enum nfsd4_op_flags {
847 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
848 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */
 849 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops required first in compound */
850};
841 851
842struct nfsd4_operation { 852struct nfsd4_operation {
843 nfsd4op_func op_func; 853 nfsd4op_func op_func;
844 u32 op_flags; 854 u32 op_flags;
845/* Most ops require a valid current filehandle; a few don't: */
846#define ALLOWED_WITHOUT_FH 1
847/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
848#define ALLOWED_ON_ABSENT_FS 2
849 char *op_name; 855 char *op_name;
850}; 856};
851 857
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
854static const char *nfsd4_op_name(unsigned opnum); 860static const char *nfsd4_op_name(unsigned opnum);
855 861
856/* 862/*
863 * This is a replay of a compound for which no cache entry pages
864 * were used. Encode the sequence operation, and if cachethis is FALSE
865 * encode the uncache rep error on the next operation.
866 */
867static __be32
868nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
869 struct nfsd4_compoundres *resp)
870{
871 struct nfsd4_op *op;
872
873 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
874 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
875
876 /* Encode the replayed sequence operation */
877 BUG_ON(resp->opcnt != 1);
878 op = &args->ops[resp->opcnt - 1];
879 nfsd4_encode_operation(resp, op);
880
881 /*return nfserr_retry_uncached_rep in next operation. */
882 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
883 op = &args->ops[resp->opcnt++];
884 op->status = nfserr_retry_uncached_rep;
885 nfsd4_encode_operation(resp, op);
886 }
887 return op->status;
888}
889
890/*
891 * Enforce NFSv4.1 COMPOUND ordering rules.
892 *
893 * TODO:
894 * - enforce NFS4ERR_NOT_ONLY_OP,
895 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
896 */
897static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
898{
899 if (args->minorversion && args->opcnt > 0) {
900 struct nfsd4_op *op = &args->ops[0];
901 return (op->status == nfserr_op_illegal) ||
902 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
903 }
904 return true;
905}
906
907/*
857 * COMPOUND call. 908 * COMPOUND call.
858 */ 909 */
859static __be32 910static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
863{ 914{
864 struct nfsd4_op *op; 915 struct nfsd4_op *op;
865 struct nfsd4_operation *opdesc; 916 struct nfsd4_operation *opdesc;
866 struct nfsd4_compound_state *cstate = NULL; 917 struct nfsd4_compound_state *cstate = &resp->cstate;
867 int slack_bytes; 918 int slack_bytes;
868 __be32 status; 919 __be32 status;
869 920
870 resp->xbuf = &rqstp->rq_res; 921 resp->xbuf = &rqstp->rq_res;
871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 922 resp->p = rqstp->rq_res.head[0].iov_base +
923 rqstp->rq_res.head[0].iov_len;
872 resp->tagp = resp->p; 924 resp->tagp = resp->p;
873 /* reserve space for: taglen, tag, and opcnt */ 925 /* reserve space for: taglen, tag, and opcnt */
874 resp->p += 2 + XDR_QUADLEN(args->taglen); 926 resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
877 resp->tag = args->tag; 929 resp->tag = args->tag;
878 resp->opcnt = 0; 930 resp->opcnt = 0;
879 resp->rqstp = rqstp; 931 resp->rqstp = rqstp;
932 resp->cstate.minorversion = args->minorversion;
933 resp->cstate.replay_owner = NULL;
934 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
935 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
936 /* Use the deferral mechanism only for NFSv4.0 compounds */
937 rqstp->rq_usedeferral = (args->minorversion == 0);
880 938
881 /* 939 /*
882 * According to RFC3010, this takes precedence over all other errors. 940 * According to RFC3010, this takes precedence over all other errors.
883 */ 941 */
884 status = nfserr_minor_vers_mismatch; 942 status = nfserr_minor_vers_mismatch;
885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 943 if (args->minorversion > nfsd_supported_minorversion)
886 goto out; 944 goto out;
887 945
888 status = nfserr_resource; 946 if (!nfs41_op_ordering_ok(args)) {
889 cstate = cstate_alloc(); 947 op = &args->ops[0];
890 if (cstate == NULL) 948 op->status = nfserr_sequence_pos;
891 goto out; 949 goto encode_op;
950 }
892 951
893 status = nfs_ok; 952 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 953 while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
897 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", 956 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
898 resp->opcnt, args->opcnt, op->opnum, 957 resp->opcnt, args->opcnt, op->opnum,
899 nfsd4_op_name(op->opnum)); 958 nfsd4_op_name(op->opnum));
900
901 /* 959 /*
902 * The XDR decode routines may have pre-set op->status; 960 * The XDR decode routines may have pre-set op->status;
903 * for example, if there is a miscellaneous XDR error 961 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
938 BUG_ON(op->status == nfs_ok); 996 BUG_ON(op->status == nfs_ok);
939 997
940encode_op: 998encode_op:
999 /* Only from SEQUENCE or CREATE_SESSION */
1000 if (resp->cstate.status == nfserr_replay_cache) {
1001 dprintk("%s NFS4.1 replay from cache\n", __func__);
1002 if (nfsd4_not_cached(resp))
1003 status = nfsd4_enc_uncached_replay(args, resp);
1004 else
1005 status = op->status;
1006 goto out;
1007 }
941 if (op->status == nfserr_replay_me) { 1008 if (op->status == nfserr_replay_me) {
942 op->replay = &cstate->replay_owner->so_replay; 1009 op->replay = &cstate->replay_owner->so_replay;
943 nfsd4_encode_replay(resp, op); 1010 nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
961 1028
962 nfsd4_increment_op_stats(op->opnum); 1029 nfsd4_increment_op_stats(op->opnum);
963 } 1030 }
1031 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1032 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1033 status = nfserr_jukebox;
1034 }
964 1035
965 cstate_free(cstate); 1036 resp->cstate.status = status;
1037 fh_put(&resp->cstate.current_fh);
1038 fh_put(&resp->cstate.save_fh);
1039 BUG_ON(resp->cstate.replay_owner);
966out: 1040out:
967 nfsd4_release_compoundargs(args); 1041 nfsd4_release_compoundargs(args);
1042 /* Reset deferral mechanism for RPC deferrals */
1043 rqstp->rq_usedeferral = 1;
968 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1044 dprintk("nfsv4 compound returned %d\n", ntohl(status));
969 return status; 1045 return status;
970} 1046}
971 1047
972static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { 1048static struct nfsd4_operation nfsd4_ops[] = {
973 [OP_ACCESS] = { 1049 [OP_ACCESS] = {
974 .op_func = (nfsd4op_func)nfsd4_access, 1050 .op_func = (nfsd4op_func)nfsd4_access,
975 .op_name = "OP_ACCESS", 1051 .op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..b5348405046b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,67 +221,42 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
247 child = list_entry(dentries->next, struct dentry_list, list); 233 while (!list_empty(&names)) {
248 status = f(dir, child->dentry); 234 entry = list_entry(names.next, struct name_list, list);
235
236 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
237 if (IS_ERR(dentry)) {
238 status = PTR_ERR(dentry);
239 break;
240 }
241 status = f(dir, dentry);
242 dput(dentry);
249 if (status) 243 if (status)
250 goto out; 244 break;
251 list_del(&child->list); 245 list_del(&entry->list);
252 dput(child->dentry); 246 kfree(entry);
253 kfree(child);
254 } 247 }
248 mutex_unlock(&dir->d_inode->i_mutex);
255out: 249out:
256 while (!list_empty(dentries)) { 250 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 251 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 252 list_del(&entry->list);
259 dput(child->dentry); 253 kfree(entry);
260 kfree(child);
261 } 254 }
262 nfs4_reset_creds(original_cred); 255 nfs4_reset_creds(original_cred);
263 return status; 256 return status;
264} 257}
265 258
266static int 259static int
267nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
268{
269 int status;
270
271 if (!S_ISREG(dir->d_inode->i_mode)) {
272 printk("nfsd4: non-file found in client recovery directory\n");
273 return -EINVAL;
274 }
275 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
276 status = vfs_unlink(dir->d_inode, dentry);
277 mutex_unlock(&dir->d_inode->i_mutex);
278 return status;
279}
280
281static int
282nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
283{
284 int status;
285
286 /* For now this directory should already be empty, but we empty it of
287 * any regular files anyway, just in case the directory was created by
288 * a kernel from the future.... */
289 nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
290 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
291 status = vfs_rmdir(dir->d_inode, dentry);
292 mutex_unlock(&dir->d_inode->i_mutex);
293 return status;
294}
295
296static int
297nfsd4_unlink_clid_dir(char *name, int namlen) 260nfsd4_unlink_clid_dir(char *name, int namlen)
298{ 261{
299 struct dentry *dentry; 262 struct dentry *dentry;
@@ -301,20 +264,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
301 264
302 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 265 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
303 266
304 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 267 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
305 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 268 dentry = lookup_one_len(name, rec_dir.dentry, namlen);
306 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
307 if (IS_ERR(dentry)) { 269 if (IS_ERR(dentry)) {
308 status = PTR_ERR(dentry); 270 status = PTR_ERR(dentry);
309 return status; 271 goto out_unlock;
310 } 272 }
311 status = -ENOENT; 273 status = -ENOENT;
312 if (!dentry->d_inode) 274 if (!dentry->d_inode)
313 goto out; 275 goto out;
314 276 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
315 status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
316out: 277out:
317 dput(dentry); 278 dput(dentry);
279out_unlock:
280 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
318 return status; 281 return status;
319} 282}
320 283
@@ -353,10 +316,11 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 316{
354 int status; 317 int status;
355 318
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 319 /* note: we currently use this path only for minorversion 0 */
320 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 321 return 0;
358 322
359 status = nfsd4_clear_clid_dir(parent, child); 323 status = vfs_rmdir(parent->d_inode, child);
360 if (status) 324 if (status)
361 printk("failed to remove client recovery directory %s\n", 325 printk("failed to remove client recovery directory %s\n",
362 child->d_name.name); 326 child->d_name.name);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..3b711f5147a7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
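
Dropping the kref for a bare atomic_t lets the final put use atomic_dec_and_lock(), so uncontended puts never touch recall_lock; a minimal sketch of the same idiom with generic names (not the patch's actual types):

	static void put_obj(struct obj *o)
	{
		/* The lock is taken only when the count is about to hit zero. */
		if (atomic_dec_and_lock(&o->refcount, &table_lock)) {
			list_del(&o->hash);	/* unhash while still locked */
			spin_unlock(&table_lock);
			kfree(o);
		}
	}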
152 142
153static int num_delegations; 143static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,290 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
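
Reading the assignments above together, the 16-byte sessionid (NFS4_MAX_SESSIONID_LEN) comes out laid out as below; the byte offsets are inferred from the field types, not spelled out in the patch:

	bytes  0..7	clientid	(cl_boot, cl_id of the owning client)
	bytes  8..11	sequence	(current_sessionid++; also the hash key)
	bytes 12..15	reserved	(always zero)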
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
 423 * slot usage to make room for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
 466	/* Set the max response cached size to our default, which is
 467	 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
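
The "one piece" allocation relies on the usual trailing-array idiom; assuming se_slots is declared as a flexible array member at the end of struct nfsd4_session, the shape is:

	struct nfsd4_session {
		...
		struct nfsd4_slot	se_slots[];	/* slot table, allocated inline */
	};

	new = kzalloc(sizeof(*new) + nslots * sizeof(struct nfsd4_slot), GFP_KERNEL);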
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses);
584}
585
314static inline void 586static inline void
315renew_client(struct nfs4_client *clp) 587renew_client(struct nfs4_client *clp)
316{ 588{
@@ -330,8 +602,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 602{
331 if (clid->cl_boot == boot_time) 603 if (clid->cl_boot == boot_time)
332 return 0; 604 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 605 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 606 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 607 return 1;
336} 608}
337 609
@@ -376,6 +648,8 @@ static inline void
376free_client(struct nfs4_client *clp) 648free_client(struct nfs4_client *clp)
377{ 649{
378 shutdown_callback_client(clp); 650 shutdown_callback_client(clp);
651 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
652 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 653 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 654 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 655 kfree(clp->cl_principal);
@@ -420,7 +694,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 694 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 695 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 696 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 697 release_openowner(sop);
698 }
699 while (!list_empty(&clp->cl_sessions)) {
700 struct nfsd4_session *ses;
701 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
702 se_perclnt);
703 release_session(ses);
424 } 704 }
425 put_nfs4_client(clp); 705 put_nfs4_client(clp);
426} 706}
@@ -439,6 +719,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 719 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 720 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 721 INIT_LIST_HEAD(&clp->cl_delegations);
722 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 723 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 724 return clp;
444} 725}
@@ -568,25 +849,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 849 return NULL;
569} 850}
570 851
852/*
853 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
 854 * parameter. Matching is based on the fact that at least one of the
855 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
856 *
857 * FIXME: we need to unify the clientid namespaces for nfsv4.x
858 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
859 * and SET_CLIENTID{,_CONFIRM}
860 */
861static inline int
862match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
863{
864 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
865 return use_exchange_id == has_exchange_flags;
866}
867
571static struct nfs4_client * 868static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 869find_confirmed_client_by_str(const char *dname, unsigned int hashval,
870 bool use_exchange_id)
573{ 871{
574 struct nfs4_client *clp; 872 struct nfs4_client *clp;
575 873
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 874 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 875 if (same_name(clp->cl_recdir, dname) &&
876 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 877 return clp;
579 } 878 }
580 return NULL; 879 return NULL;
581} 880}
582 881
583static struct nfs4_client * 882static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 883find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
884 bool use_exchange_id)
585{ 885{
586 struct nfs4_client *clp; 886 struct nfs4_client *clp;
587 887
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 888 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 889 if (same_name(clp->cl_recdir, dname) &&
890 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 891 return clp;
591 } 892 }
592 return NULL; 893 return NULL;
@@ -685,6 +986,534 @@ out_err:
685 return; 986 return;
686} 987}
687 988
989void
990nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
991{
992 struct nfsd4_compoundres *resp = rqstp->rq_resp;
993
994 resp->cstate.statp = statp;
995}
996
997/*
998 * Dereference the result pages.
999 */
1000static void
1001nfsd4_release_respages(struct page **respages, short resused)
1002{
1003 int i;
1004
1005 dprintk("--> %s\n", __func__);
1006 for (i = 0; i < resused; i++) {
1007 if (!respages[i])
1008 continue;
1009 put_page(respages[i]);
1010 respages[i] = NULL;
1011 }
1012}
1013
1014static void
1015nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
1016{
1017 int i;
1018
1019 for (i = 0; i < count; i++) {
1020 topages[i] = frompages[i];
1021 if (!topages[i])
1022 continue;
1023 get_page(topages[i]);
1024 }
1025}
1026
1027/*
1028 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
1029 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
1030 * length of the XDR response is less than se_fmaxresp_cached
 1031 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for
 1032 * part of the reply (e.g. readdir).
1033 *
 1034 * Store the base and length of the rq_res.head[0] page
1035 * of the NFSv4.1 data, just past the rpc header.
1036 */
1037void
1038nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1039{
1040 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1041 struct svc_rqst *rqstp = resp->rqstp;
1042 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1043 struct nfsd4_op *op = &args->ops[resp->opcnt];
1044 struct kvec *resv = &rqstp->rq_res.head[0];
1045
1046 dprintk("--> %s entry %p\n", __func__, entry);
1047
1048 /* Don't cache a failed OP_SEQUENCE. */
1049 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1050 return;
1051
1052 nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
1053 entry->ce_opcnt = resp->opcnt;
1054 entry->ce_status = resp->cstate.status;
1055
1056 /*
1057 * Don't need a page to cache just the sequence operation - the slot
1058 * does this for us!
1059 */
1060
1061 if (nfsd4_not_cached(resp)) {
1062 entry->ce_resused = 0;
1063 entry->ce_rpchdrlen = 0;
1064 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
1065 resp->cstate.slot->sl_cache_entry.ce_cachethis);
1066 return;
1067 }
1068 entry->ce_resused = rqstp->rq_resused;
1069 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1070 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1071 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1072 entry->ce_resused);
1073 entry->ce_datav.iov_base = resp->cstate.statp;
1074 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1075 (char *)page_address(rqstp->rq_respages[0]));
 1076	/* Current request rpc header length */
1077 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1078 (char *)page_address(rqstp->rq_respages[0]);
1079}
1080
1081/*
1082 * We keep the rpc header, but take the nfs reply from the replycache.
1083 */
1084static int
1085nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1086 struct nfsd4_cache_entry *entry)
1087{
1088 struct svc_rqst *rqstp = resp->rqstp;
1089 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1090 int len;
1091
 1092	/* Current request rpc header length */
1093 len = (char *)resp->cstate.statp -
1094 (char *)page_address(rqstp->rq_respages[0]);
1095 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1096 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1097 entry->ce_datav.iov_len);
1098 return 0;
1099 }
1100 /* copy the cached reply nfsd data past the current rpc header */
1101 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1102 entry->ce_datav.iov_len);
1103 resv->iov_len = len + entry->ce_datav.iov_len;
1104 return 1;
1105}
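
Net effect on the head iovec after a successful copy (sketch; len is the current rpc header length computed above):

	resv->iov_base
	|<-------- len -------->|<-- entry->ce_datav.iov_len -->|
	[ current rpc header    ][ cached NFSv4.1 reply data    ]
	resv->iov_len = len + entry->ce_datav.iov_len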
1106
1107/*
1108 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
 1109 * cached page. Replace any further replay pages from the cache.
1110 */
1111__be32
1112nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1113 struct nfsd4_sequence *seq)
1114{
1115 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1116 __be32 status;
1117
1118 dprintk("--> %s entry %p\n", __func__, entry);
1119
1120 /*
1121 * If this is just the sequence operation, we did not keep
1122 * a page in the cache entry because we can just use the
1123 * slot info stored in struct nfsd4_sequence that was checked
1124 * against the slot in nfsd4_sequence().
1125 *
1126 * This occurs when seq->cachethis is FALSE, or when the client
1127 * session inactivity timer fires and a solo sequence operation
1128 * is sent (lease renewal).
1129 */
1130 if (seq && nfsd4_not_cached(resp)) {
1131 seq->maxslots = resp->cstate.session->se_fnumslots;
1132 return nfs_ok;
1133 }
1134
1135 if (!nfsd41_copy_replay_data(resp, entry)) {
1136 /*
1137 * Not enough room to use the replay rpc header, send the
1138 * cached header. Release all the allocated result pages.
1139 */
1140 svc_free_res_pages(resp->rqstp);
1141 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1142 entry->ce_resused);
1143 } else {
1144 /* Release all but the first allocated result page */
1145
1146 resp->rqstp->rq_resused--;
1147 svc_free_res_pages(resp->rqstp);
1148
1149 nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
1150 &entry->ce_respages[1],
1151 entry->ce_resused - 1);
1152 }
1153
1154 resp->rqstp->rq_resused = entry->ce_resused;
1155 resp->opcnt = entry->ce_opcnt;
1156 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
1157 status = entry->ce_status;
1158
1159 return status;
1160}
1161
1162/*
1163 * Set the exchange_id flags returned by the server.
1164 */
1165static void
1166nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1167{
1168 /* pNFS is not supported */
1169 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1170
1171 /* Referrals are supported, Migration is not. */
1172 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1173
1174 /* set the wire flags to return to client. */
1175 clid->flags = new->cl_exchange_flags;
1176}
1177
1178__be32
1179nfsd4_exchange_id(struct svc_rqst *rqstp,
1180 struct nfsd4_compound_state *cstate,
1181 struct nfsd4_exchange_id *exid)
1182{
1183 struct nfs4_client *unconf, *conf, *new;
1184 int status;
1185 unsigned int strhashval;
1186 char dname[HEXDIR_LEN];
1187 nfs4_verifier verf = exid->verifier;
1188 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1189
1190 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1191 " ip_addr=%u flags %x, spa_how %d\n",
1192 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1193 ip_addr, exid->flags, exid->spa_how);
1194
1195 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1196 return nfserr_inval;
1197
1198 /* Currently only support SP4_NONE */
1199 switch (exid->spa_how) {
1200 case SP4_NONE:
1201 break;
1202 case SP4_SSV:
1203 return nfserr_encr_alg_unsupp;
1204 default:
1205 BUG(); /* checked by xdr code */
1206 case SP4_MACH_CRED:
1207 return nfserr_serverfault; /* no excuse :-/ */
1208 }
1209
1210 status = nfs4_make_rec_clidname(dname, &exid->clname);
1211
1212 if (status)
1213 goto error;
1214
1215 strhashval = clientstr_hashval(dname);
1216
1217 nfs4_lock_state();
1218 status = nfs_ok;
1219
1220 conf = find_confirmed_client_by_str(dname, strhashval, true);
1221 if (conf) {
1222 if (!same_verf(&verf, &conf->cl_verifier)) {
1223 /* 18.35.4 case 8 */
1224 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1225 status = nfserr_not_same;
1226 goto out;
1227 }
1228 /* Client reboot: destroy old state */
1229 expire_client(conf);
1230 goto out_new;
1231 }
1232 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1233 /* 18.35.4 case 9 */
1234 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1235 status = nfserr_perm;
1236 goto out;
1237 }
1238 expire_client(conf);
1239 goto out_new;
1240 }
1241 if (ip_addr != conf->cl_addr &&
1242 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1243 /* Client collision. 18.35.4 case 3 */
1244 status = nfserr_clid_inuse;
1245 goto out;
1246 }
1247 /*
1248 * Set bit when the owner id and verifier map to an already
1249 * confirmed client id (18.35.3).
1250 */
1251 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1252
1253 /*
1254 * Falling into 18.35.4 case 2, possible router replay.
1255 * Leave confirmed record intact and return same result.
1256 */
1257 copy_verf(conf, &verf);
1258 new = conf;
1259 goto out_copy;
1260 } else {
1261 /* 18.35.4 case 7 */
1262 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1263 status = nfserr_noent;
1264 goto out;
1265 }
1266 }
1267
1268 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
1269 if (unconf) {
1270 /*
1271 * Possible retry or client restart. Per 18.35.4 case 4,
1272 * a new unconfirmed record should be generated regardless
1273 * of whether any properties have changed.
1274 */
1275 expire_client(unconf);
1276 }
1277
1278out_new:
1279 /* Normal case */
1280 new = create_client(exid->clname, dname);
1281 if (new == NULL) {
1282 status = nfserr_resource;
1283 goto out;
1284 }
1285
1286 copy_verf(new, &verf);
1287 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1288 new->cl_addr = ip_addr;
1289 gen_clid(new);
1290 gen_confirm(new);
1291 add_to_unconfirmed(new, strhashval);
1292out_copy:
1293 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1294 exid->clientid.cl_id = new->cl_clientid.cl_id;
1295
1296 new->cl_slot.sl_seqid = 0;
1297 exid->seqid = 1;
1298 nfsd4_set_ex_flags(new, exid);
1299
1300 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1301 new->cl_slot.sl_seqid, new->cl_exchange_flags);
1302 status = nfs_ok;
1303
1304out:
1305 nfs4_unlock_state();
1306error:
1307 dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
1308 return status;
1309}
1310
1311static int
1312check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1313{
1314 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1315 slot->sl_seqid);
1316
1317 /* The slot is in use, and no response has been sent. */
1318 if (slot->sl_inuse) {
1319 if (seqid == slot->sl_seqid)
1320 return nfserr_jukebox;
1321 else
1322 return nfserr_seq_misordered;
1323 }
1324 /* Normal */
1325 if (likely(seqid == slot->sl_seqid + 1))
1326 return nfs_ok;
1327 /* Replay */
1328 if (seqid == slot->sl_seqid)
1329 return nfserr_replay_cache;
1330 /* Wraparound */
1331 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1332 return nfs_ok;
1333 /* Misordered replay or misordered new request */
1334 return nfserr_seq_misordered;
1335}
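
Worked through for a slot whose sl_seqid is 7 (hypothetical values), the checks above resolve as:

	seqid 8, slot idle		-> nfs_ok		(next request, in order)
	seqid 7, slot idle		-> nfserr_replay_cache	(retransmit; serve cached reply)
	seqid 7, slot in use		-> nfserr_jukebox	(original still in progress)
	seqid 1, sl_seqid 0xffffffff	-> nfs_ok		(wraparound)
	anything else			-> nfserr_seq_misordered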
1336
1337__be32
1338nfsd4_create_session(struct svc_rqst *rqstp,
1339 struct nfsd4_compound_state *cstate,
1340 struct nfsd4_create_session *cr_ses)
1341{
1342 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1343 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1344 struct nfs4_client *conf, *unconf;
1345 struct nfsd4_slot *slot = NULL;
1346 int status = 0;
1347
1348 nfs4_lock_state();
1349 unconf = find_unconfirmed_client(&cr_ses->clientid);
1350 conf = find_confirmed_client(&cr_ses->clientid);
1351
1352 if (conf) {
1353 slot = &conf->cl_slot;
1354 status = check_slot_seqid(cr_ses->seqid, slot);
1355 if (status == nfserr_replay_cache) {
1356 dprintk("Got a create_session replay! seqid= %d\n",
1357 slot->sl_seqid);
1358 cstate->slot = slot;
1359 cstate->status = status;
1360 /* Return the cached reply status */
1361 status = nfsd4_replay_cache_entry(resp, NULL);
1362 goto out;
1363 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
1364 status = nfserr_seq_misordered;
1365 dprintk("Sequence misordered!\n");
1366 dprintk("Expected seqid= %d but got seqid= %d\n",
1367 slot->sl_seqid, cr_ses->seqid);
1368 goto out;
1369 }
1370 conf->cl_slot.sl_seqid++;
1371 } else if (unconf) {
1372 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1373 (ip_addr != unconf->cl_addr)) {
1374 status = nfserr_clid_inuse;
1375 goto out;
1376 }
1377
1378 slot = &unconf->cl_slot;
1379 status = check_slot_seqid(cr_ses->seqid, slot);
1380 if (status) {
1381 /* an unconfirmed replay returns misordered */
1382 status = nfserr_seq_misordered;
1383 goto out;
1384 }
1385
1386 slot->sl_seqid++; /* from 0 to 1 */
1387 move_to_confirmed(unconf);
1388
1389 /*
1390 * We do not support RDMA or persistent sessions
1391 */
1392 cr_ses->flags &= ~SESSION4_PERSIST;
1393 cr_ses->flags &= ~SESSION4_RDMA;
1394
1395 conf = unconf;
1396 } else {
1397 status = nfserr_stale_clientid;
1398 goto out;
1399 }
1400
1401 status = alloc_init_session(rqstp, conf, cr_ses);
1402 if (status)
1403 goto out;
1404
1405 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1406 NFS4_MAX_SESSIONID_LEN);
1407 cr_ses->seqid = slot->sl_seqid;
1408
1409 slot->sl_inuse = true;
1410 cstate->slot = slot;
1411 /* Ensure a page is used for the cache */
1412 slot->sl_cache_entry.ce_cachethis = 1;
1413out:
1414 nfs4_unlock_state();
1415 dprintk("%s returns %d\n", __func__, ntohl(status));
1416 return status;
1417}
1418
1419__be32
1420nfsd4_destroy_session(struct svc_rqst *r,
1421 struct nfsd4_compound_state *cstate,
1422 struct nfsd4_destroy_session *sessionid)
1423{
1424 struct nfsd4_session *ses;
1425 u32 status = nfserr_badsession;
1426
1427 /* Notes:
 1428	 * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid
1429 * - Should we return nfserr_back_chan_busy if waiting for
1430 * callbacks on to-be-destroyed session?
1431 * - Do we need to clear any callback info from previous session?
1432 */
1433
1434 dump_sessionid(__func__, &sessionid->sessionid);
1435 spin_lock(&sessionid_lock);
1436 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1437 if (!ses) {
1438 spin_unlock(&sessionid_lock);
1439 goto out;
1440 }
1441
1442 unhash_session(ses);
1443 spin_unlock(&sessionid_lock);
1444
1445 /* wait for callbacks */
1446 shutdown_callback_client(ses->se_client);
1447 nfsd4_put_session(ses);
1448 status = nfs_ok;
1449out:
1450 dprintk("%s returns %d\n", __func__, ntohl(status));
1451 return status;
1452}
1453
1454__be32
1455nfsd4_sequence(struct svc_rqst *rqstp,
1456 struct nfsd4_compound_state *cstate,
1457 struct nfsd4_sequence *seq)
1458{
1459 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1460 struct nfsd4_session *session;
1461 struct nfsd4_slot *slot;
1462 int status;
1463
1464 if (resp->opcnt != 1)
1465 return nfserr_sequence_pos;
1466
1467 spin_lock(&sessionid_lock);
1468 status = nfserr_badsession;
1469 session = find_in_sessionid_hashtbl(&seq->sessionid);
1470 if (!session)
1471 goto out;
1472
1473 status = nfserr_badslot;
1474 if (seq->slotid >= session->se_fnumslots)
1475 goto out;
1476
1477 slot = &session->se_slots[seq->slotid];
1478 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1479
1480 status = check_slot_seqid(seq->seqid, slot);
1481 if (status == nfserr_replay_cache) {
1482 cstate->slot = slot;
1483 cstate->session = session;
1484 /* Return the cached reply status and set cstate->status
1485 * for nfsd4_svc_encode_compoundres processing */
1486 status = nfsd4_replay_cache_entry(resp, seq);
1487 cstate->status = nfserr_replay_cache;
1488 goto replay_cache;
1489 }
1490 if (status)
1491 goto out;
1492
1493 /* Success! bump slot seqid */
1494 slot->sl_inuse = true;
1495 slot->sl_seqid = seq->seqid;
1496 slot->sl_cache_entry.ce_cachethis = seq->cachethis;
1497 /* Always set the cache entry cachethis for solo sequence */
1498 if (nfsd4_is_solo_sequence(resp))
1499 slot->sl_cache_entry.ce_cachethis = 1;
1500
1501 cstate->slot = slot;
1502 cstate->session = session;
1503
1504replay_cache:
1505 /* Renew the clientid on success and on replay.
1506 * Hold a session reference until done processing the compound:
1507 * nfsd4_put_session called only if the cstate slot is set.
1508 */
1509 renew_client(session->se_client);
1510 nfsd4_get_session(session);
1511out:
1512 spin_unlock(&sessionid_lock);
1513 dprintk("%s: return %d\n", __func__, ntohl(status));
1514 return status;
1515}
1516
688__be32 1517__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1518nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1519 struct nfsd4_setclientid *setclid)
@@ -716,14 +1545,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1545 strhashval = clientstr_hashval(dname);
717 1546
718 nfs4_lock_state(); 1547 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1548 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1549 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1550 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1551 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1552 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1553 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1554 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1555 goto out;
728 } 1556 }
729 } 1557 }
@@ -732,7 +1560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1560 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1561 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1562 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1563 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1564 status = nfserr_resource;
737 if (!conf) { 1565 if (!conf) {
738 /* 1566 /*
@@ -887,7 +1715,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1715 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1716 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1717 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1718 hash, false);
891 if (conf) { 1719 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1720 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1721 expire_client(conf);
@@ -923,11 +1751,13 @@ alloc_init_file(struct inode *ino)
923 1751
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1752 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1753 if (fp) {
926 kref_init(&fp->fi_ref); 1754 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1755 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1756 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1757 INIT_LIST_HEAD(&fp->fi_delegations);
1758 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1759 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1760 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1761 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1762 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1763 fp->fi_had_conflict = false;
@@ -1037,48 +1867,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1867 return sop;
1038} 1868}
1039 1869
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1870static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1871init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1872 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1888,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1888 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1889 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1890 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1891 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1892 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1893 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1894 stp->st_openstp = NULL;
1106} 1895}
1107 1896
1108static void 1897static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1898move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1899{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1900 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1931,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1931 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1932 struct nfs4_file *fp;
1162 1933
1934 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1935 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1936 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1937 get_nfs4_file(fp);
1938 spin_unlock(&recall_lock);
1166 return fp; 1939 return fp;
1167 } 1940 }
1168 } 1941 }
1942 spin_unlock(&recall_lock);
1169 return NULL; 1943 return NULL;
1170} 1944}
1171 1945
1172static inline int access_valid(u32 x) 1946static inline int access_valid(u32 x, u32 minorversion)
1173{ 1947{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1948 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1949 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1950 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1951 return 0;
1952 x &= ~NFS4_SHARE_ACCESS_MASK;
1953 if (minorversion && x) {
1954 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1955 return 0;
1956 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1957 return 0;
1958 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1959 }
1960 if (x)
1177 return 0; 1961 return 0;
1178 return 1; 1962 return 1;
1179} 1963}
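
For reference, the v4.1 share_access word that access_valid() unpacks carries three fields; the values below are the standard RFC 5661 constants, assumed rather than quoted from this patch:

	NFS4_SHARE_ACCESS_MASK (0x000f):  READ=0x1, WRITE=0x2, BOTH=0x3
	NFS4_SHARE_WANT_MASK   (0xff00):  NO_PREFERENCE=0x0000 .. WANT_CANCEL=0x0500
	NFS4_SHARE_WHEN_MASK   (0xf0000): SIGNAL_DELEG_WHEN_RESRC_AVAIL=0x10000,
					  PUSH_DELEG_WHEN_UNCONTENDED=0x20000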
@@ -1409,7 +2193,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2193
1410 2194
1411__be32 2195__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2196nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2197 struct nfsd4_open *open)
1413{ 2198{
1414 clientid_t *clientid = &open->op_clientid; 2199 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2200 struct nfs4_client *clp = NULL;
@@ -1432,10 +2217,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2217 return nfserr_expired;
1433 goto renew; 2218 goto renew;
1434 } 2219 }
2220 /* When sessions are used, skip open sequenceid processing */
2221 if (nfsd4_has_session(cstate))
2222 goto renew;
1435 if (!sop->so_confirmed) { 2223 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2224 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2225 clp = sop->so_client;
1438 release_stateowner(sop); 2226 release_openowner(sop);
1439 open->op_stateowner = NULL; 2227 open->op_stateowner = NULL;
1440 goto renew; 2228 goto renew;
1441 } 2229 }
@@ -1709,6 +2497,7 @@ out:
1709__be32 2497__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2498nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2499{
2500 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2501 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2502 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2503 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2505,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2505 __be32 status;
1717 2506
1718 status = nfserr_inval; 2507 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2508 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2509 || !deny_valid(open->op_share_deny))
1721 goto out; 2510 goto out;
1722 /* 2511 /*
@@ -1764,12 +2553,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2553 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2554 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2555 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2556 release_open_stateid(stp);
1768 goto out; 2557 goto out;
1769 } 2558 }
2559 if (nfsd4_has_session(&resp->cstate))
2560 update_stateid(&stp->st_stateid);
1770 } 2561 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2562 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2563
2564 if (nfsd4_has_session(&resp->cstate))
2565 open->op_stateowner->so_confirmed = 1;
2566
1773 /* 2567 /*
1774 * Attempt to hand out a delegation. No error return, because the 2568 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2569 * OPEN succeeds even if we fail.
@@ -1790,7 +2584,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2584 * To finish the open response, we just need to set the rflags.
1791 */ 2585 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2586 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2587 if (!open->op_stateowner->so_confirmed &&
2588 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2589 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2590
1796 return status; 2591 return status;
@@ -1898,7 +2693,7 @@ nfs4_laundromat(void)
1898 } 2693 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2694 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2695 sop->so_id);
1901 release_stateowner(sop); 2696 release_openowner(sop);
1902 } 2697 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2698 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2699 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2778,7 @@ out:
1983static inline __be32 2778static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2779check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2780{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2781 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2782 return nfs_ok;
1991 else if (locks_in_grace()) { 2783 else if (locks_in_grace()) {
 1992		/* Answer in remaining cases depends on existence of 2784
@@ -2005,14 +2797,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2797 * that are not able to provide mandatory locking.
2006 */ 2798 */
2007static inline int 2799static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2800grace_disallows_io(struct inode *inode)
2009{ 2801{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2802 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2803}
2013 2804
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2805static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2806{
2807 /*
2808 * When sessions are used the stateid generation number is ignored
2809 * when it is zero.
2810 */
2811 if ((flags & HAS_SESSION) && in->si_generation == 0)
2812 goto out;
2813
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2814 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2815 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2816 return nfserr_bad_stateid;
@@ -2028,74 +2826,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2826 */
2029 if (in->si_generation < ref->si_generation) 2827 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2828 return nfserr_old_stateid;
2829out:
2031 return nfs_ok; 2830 return nfs_ok;
2032} 2831}
2033 2832
2833static int is_delegation_stateid(stateid_t *stateid)
2834{
2835 return stateid->si_fileid == 0;
2836}
2837
2034/* 2838/*
2035* Checks for stateid operations 2839* Checks for stateid operations
2036*/ 2840*/
2037__be32 2841__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2842nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2843 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2844{
2040 struct nfs4_stateid *stp = NULL; 2845 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2846 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2847 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2848 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2849 __be32 status;
2045 2850
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2851 if (filpp)
2050 *filpp = NULL; 2852 *filpp = NULL;
2051 2853
2052 if (io_during_grace_disallowed(ino, flags)) 2854 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2855 return nfserr_grace;
2054 2856
2857 if (nfsd4_has_session(cstate))
2858 flags |= HAS_SESSION;
2859
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2860 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2861 return check_special_stateids(current_fh, stateid, flags);
2057 2862
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2863 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2864 if (STALE_STATEID(stateid))
2061 goto out; 2865 goto out;
2062 2866
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2867 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2868 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2869 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2870 if (!dp)
2068 goto out; 2871 goto out;
2069 } 2872 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2873 flags);
2874 if (status)
2875 goto out;
2876 status = nfs4_check_delegmode(dp, flags);
2877 if (status)
2878 goto out;
2879 renew_client(dp->dl_client);
2880 if (filpp)
2881 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2882 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2883 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2884 if (!stp)
2074 goto out; 2885 goto out;
2075 } 2886 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2887 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2888 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2889 goto out;
2080 stidp = &stp->st_stateid; 2890 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2891 flags);
2082 status = check_stateid_generation(stateid, stidp); 2892 if (status)
2083 if (status) 2893 goto out;
2084 goto out; 2894 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2895 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2896 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2897 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2898 if (filpp)
2090 *filpp = stp->st_vfs_file; 2899 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2900 }
2100 status = nfs_ok; 2901 status = nfs_ok;
2101out: 2902out:
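
The rewritten nfs4_preprocess_stateid_op splits the delegation and open/lock paths cleanly and threads a HAS_SESSION flag down into the generation check, where a zero si_generation from a v4.1 client means "use the current stateid". A standalone model of that generation rule (the struct and error values are simplified stand-ins for stateid_t and the nfserr_* codes):

    #include <stdint.h>

    enum { HAS_SESSION = 0x01 };            /* stand-in for nfsd's flag */
    enum { NFS_OK = 0, BAD_STATEID = 10025, OLD_STATEID = 10024 };

    struct stateid { uint32_t si_generation; };   /* simplified stateid_t */

    /* Model of check_stateid_generation() after this hunk: a v4.1
     * client may send generation 0 to mean "whatever is current". */
    static int check_generation(const struct stateid *in,
                                const struct stateid *ref, int flags)
    {
        if ((flags & HAS_SESSION) && in->si_generation == 0)
            return NFS_OK;
        if (in->si_generation > ref->si_generation)
            return BAD_STATEID;   /* stateid from the future: client bug */
        if (in->si_generation < ref->si_generation)
            return OLD_STATEID;   /* stateid has since been replaced */
        return NFS_OK;
    }
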
@@ -2113,10 +2914,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2914 * Checks for sequence id mutating operations.
2114 */ 2915 */
2115static __be32 2916static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2917nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2918 stateid_t *stateid, int flags,
2919 struct nfs4_stateowner **sopp,
2920 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2921{
2118 struct nfs4_stateid *stp; 2922 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2923 struct nfs4_stateowner *sop;
2924 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2925 __be32 status;
2121 2926
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2927 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2939,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2939
2135 if (STALE_STATEID(stateid)) 2940 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2941 return nfserr_stale_stateid;
2942
2943 if (nfsd4_has_session(cstate))
2944 flags |= HAS_SESSION;
2945
2137 /* 2946 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2947 * We return BAD_STATEID if filehandle doesn't match stateid,
2139 * the confirmed flag is incorrectly set, or the generation 2948
@@ -2166,8 +2975,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2975 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2976 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2977 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2978 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2979 !same_clid(&clp->cl_clientid, lockclid))
2980 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2981 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2982 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2983 if (status)
@@ -2190,7 +3000,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3000 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3001 * generation number wraparound.
2192 */ 3002 */
2193 if (seqid != sop->so_seqid) 3003 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3004 goto check_replay;
2195 3005
2196 if (sop->so_confirmed && flags & CONFIRM) { 3006 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3013,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3013 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3014 return nfserr_bad_stateid;
2205 } 3015 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3016 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3017 if (status)
2208 return status; 3018 return status;
2209 renew_client(sop->so_client); 3019 renew_client(sop->so_client);
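
In the seqid-mutating path the same HAS_SESSION flag disables both the client-id cross-check for new lock owners and the per-owner seqid replay check: under sessions, replay detection belongs to the slot table rather than to owner seqids. A one-line model of the guard (names are illustrative):

    #include <stdint.h>

    enum { HAS_SESSION = 0x01 };

    /* Model: v4.0 serializes seqid-mutating ops through a per-owner
     * counter; under sessions the slot table already provides
     * exactly-once semantics, so the counter comparison is skipped. */
    static int seqid_is_replay(uint32_t seqid, uint32_t so_seqid, int flags)
    {
        if (flags & HAS_SESSION)
            return 0;
        return seqid != so_seqid;
    }
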
@@ -2239,7 +3049,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3049
2240 nfs4_lock_state(); 3050 nfs4_lock_state();
2241 3051
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3052 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3053 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3054 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3055 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3114,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3114 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3115 cstate->current_fh.fh_dentry->d_name.name);
2306 3116
2307 if (!access_valid(od->od_share_access) 3117 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3118 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3119 return nfserr_inval;
2310 3120
2311 nfs4_lock_state(); 3121 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3122 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3123 od->od_seqid,
2314 &od->od_stateid, 3124 &od->od_stateid,
2315 OPEN_STATE, 3125 OPEN_STATE,
@@ -2362,7 +3172,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3172
2363 nfs4_lock_state(); 3173 nfs4_lock_state();
2364 /* check close_lru for replay */ 3174 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3175 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3176 close->cl_seqid,
2367 &close->cl_stateid, 3177 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3178 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3183,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3183 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3184
2375 /* release_stateid() calls nfsd_close() if needed */ 3185 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3186 release_open_stateid(stp);
2377 3187
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3188 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3189 * released by the laundromat service after the lease period
@@ -2394,16 +3204,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3204nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3205 struct nfsd4_delegreturn *dr)
2396{ 3206{
3207 struct nfs4_delegation *dp;
3208 stateid_t *stateid = &dr->dr_stateid;
3209 struct inode *inode;
2397 __be32 status; 3210 __be32 status;
3211 int flags = 0;
2398 3212
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3213 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3214 return status;
3215 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3216
3217 if (nfsd4_has_session(cstate))
3218 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3219 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3220 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3221 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3222 goto out;
3223 status = nfserr_stale_stateid;
3224 if (STALE_STATEID(stateid))
3225 goto out;
3226 status = nfserr_bad_stateid;
3227 if (!is_delegation_stateid(stateid))
3228 goto out;
3229 dp = find_delegation_stateid(inode, stateid);
3230 if (!dp)
3231 goto out;
3232 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3233 if (status)
3234 goto out;
3235 renew_client(dp->dl_client);
3236
3237 unhash_delegation(dp);
2406out: 3238out:
3239 nfs4_unlock_state();
3240
2407 return status; 3241 return status;
2408} 3242}
2409 3243
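
DELEGRETURN no longer funnels through nfs4_preprocess_stateid_op (the old DELEG_RET flag is gone); it open-codes the special/stale/type checks and unhashes the delegation itself, all under nfs4_lock_state(). The type test relies on how nfsd mints stateids: delegation stateids carry a zero si_fileid. A standalone model, with the field order following the dprintk removed earlier in this diff:

    #include <stdint.h>

    /* Simplified stateid_t */
    struct stateid4 {
        uint32_t si_boot;
        uint32_t si_stateownerid;
        uint32_t si_fileid;
        uint32_t si_generation;
    };

    /* Model of is_delegation_stateid(): nfsd mints delegation stateids
     * with a zero file id, so the type test is one field compare. */
    static int is_delegation(const struct stateid4 *sid)
    {
        return sid->si_fileid == 0;
    }
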
@@ -2684,11 +3518,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3518 struct nfs4_file *fp;
2685 3519
2686 status = nfserr_stale_clientid; 3520 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3521 if (!nfsd4_has_session(cstate) &&
3522 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3523 goto out;
2689 3524
2690 /* validate and update open stateid and open seqid */ 3525 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3526 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3527 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3528 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3529 OPEN_STATE,
@@ -2715,7 +3550,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3550 goto out;
2716 } else { 3551 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3552 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3553 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3554 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3555 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3556 LOCK_STATE,
@@ -2788,7 +3623,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3623 }
2789out: 3624out:
2790 if (status && lock->lk_is_new && lock_sop) 3625 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3626 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3627 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3628 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3629 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3673,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3673 nfs4_lock_state();
2839 3674
2840 status = nfserr_stale_clientid; 3675 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3676 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3677 goto out;
2843 3678
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3679 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3746,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3746
2912 nfs4_lock_state(); 3747 nfs4_lock_state();
2913 3748
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3749 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3750 locku->lu_seqid,
2916 &locku->lu_stateid, 3751 &locku->lu_stateid,
2917 LOCK_STATE, 3752 LOCK_STATE,
@@ -3037,7 +3872,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3872 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3873 * for openowners. */
3039 list_del(&sop->so_perclient); 3874 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3875 release_lockowner(sop);
3041 } 3876 }
3042out: 3877out:
3043 nfs4_unlock_state(); 3878 nfs4_unlock_state();
@@ -3051,12 +3886,12 @@ alloc_reclaim(void)
3051} 3886}
3052 3887
3053int 3888int
3054nfs4_has_reclaimed_state(const char *name) 3889nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3890{
3056 unsigned int strhashval = clientstr_hashval(name); 3891 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3892 struct nfs4_client *clp;
3058 3893
3059 clp = find_confirmed_client_by_str(name, strhashval); 3894 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3895 return clp ? 1 : 0;
3061} 3896}
3062 3897
@@ -3153,6 +3988,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3988 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3990 }
3991 for (i = 0; i < SESSION_HASH_SIZE; i++)
3992 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3993 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3994 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3995 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b73549d293be 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
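
The SEQUENCE decoder can pull its whole argument list with one READ_BUF because every field of SEQUENCE4args is fixed-size: a 16-byte session id (NFS4_MAX_SESSIONID_LEN) followed by four 4-byte words, 32 bytes total. A sketch of the wire image (field order per the NFSv4.1 spec; XDR big-endian conversion elided):

    #include <stdint.h>

    #define NFS4_MAX_SESSIONID_LEN 16   /* fixed-size opaque per the spec */

    /* Wire image of SEQUENCE4args: all fixed-size, hence the single
     * READ_BUF(NFS4_MAX_SESSIONID_LEN + 16) == 32 bytes above. */
    struct sequence4args_wire {
        uint8_t  sessionid[NFS4_MAX_SESSIONID_LEN];
        uint32_t seqid;
        uint32_t slotid;
        uint32_t maxslots;
        uint32_t cachethis;   /* XDR bool */
    };
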
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
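
Versioned decode tables mean operation filtering happens entirely at decode time: the compound's minorversion selects a whole decoder vector, and v4.0-only ops (OPEN_CONFIRM, RENEW, SETCLIENTID, RELEASE_LOCKOWNER, ...) map to nfsd4_decode_notsupp in the v4.1 table, never reaching the state machinery. That is also why a single shared encode vector suffices later on. A minimal model of the lookup (types simplified; the NULL return stands in for "op illegal"):

    #include <stddef.h>

    typedef int (*dec_fn)(void *argp, void *op);

    struct minorversion_ops {
        const dec_fn *decoders;
        size_t nops;
    };

    /* Model of the dispatch: minorversion picks an entire vector, so
     * unknown minorversions and out-of-range ops fail up front. */
    static dec_fn lookup_decoder(const struct minorversion_ops *tbl,
                                 size_t ntbl, unsigned int minorversion,
                                 unsigned int opnum)
    {
        if (minorversion >= ntbl || opnum >= tbl[minorversion].nops)
            return NULL;
        return tbl[minorversion].decoders[opnum];
    }
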
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -1843,6 +2214,15 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1843 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 2214 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1844 if (IS_ERR(dentry)) 2215 if (IS_ERR(dentry))
1845 return nfserrno(PTR_ERR(dentry)); 2216 return nfserrno(PTR_ERR(dentry));
2217 if (!dentry->d_inode) {
2218 /*
2219 * nfsd_buffered_readdir drops the i_mutex between
2220 * readdir and calling this callback, leaving a window
2221 * where this directory entry could have gone away.
2222 */
2223 dput(dentry);
2224 return nfserr_noent;
2225 }
1846 2226
1847 exp_get(exp); 2227 exp_get(exp);
1848 /* 2228 /*
@@ -1905,6 +2285,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1905 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); 2285 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
1906 int buflen; 2286 int buflen;
1907 __be32 *p = cd->buffer; 2287 __be32 *p = cd->buffer;
2288 __be32 *cookiep;
1908 __be32 nfserr = nfserr_toosmall; 2289 __be32 nfserr = nfserr_toosmall;
1909 2290
1910 /* In nfsv4, "." and ".." never make it onto the wire.. */ 2291 /* In nfsv4, "." and ".." never make it onto the wire.. */
@@ -1921,7 +2302,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1921 goto fail; 2302 goto fail;
1922 2303
1923 *p++ = xdr_one; /* mark entry present */ 2304 *p++ = xdr_one; /* mark entry present */
1924 cd->offset = p; /* remember pointer */ 2305 cookiep = p;
1925 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ 2306 p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
1926 p = xdr_encode_array(p, name, namlen); /* name length & name */ 2307 p = xdr_encode_array(p, name, namlen); /* name length & name */
1927 2308
@@ -1935,6 +2316,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1935 goto fail; 2316 goto fail;
1936 case nfserr_dropit: 2317 case nfserr_dropit:
1937 goto fail; 2318 goto fail;
2319 case nfserr_noent:
2320 goto skip_entry;
1938 default: 2321 default:
1939 /* 2322 /*
1940 * If the client requested the RDATTR_ERROR attribute, 2323 * If the client requested the RDATTR_ERROR attribute,
@@ -1953,6 +2336,8 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
1953 } 2336 }
1954 cd->buflen -= (p - cd->buffer); 2337 cd->buflen -= (p - cd->buffer);
1955 cd->buffer = p; 2338 cd->buffer = p;
2339 cd->offset = cookiep;
2340skip_entry:
1956 cd->common.err = nfs_ok; 2341 cd->common.err = nfs_ok;
1957 return 0; 2342 return 0;
1958fail: 2343fail:
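
The readdir change turns cookie publication into a commit step: the cookie slot is staged in cookiep and only stored to cd->offset after the attributes encode cleanly, so an entry whose dentry vanished in the i_mutex window (nfserr_noent) is silently skipped instead of failing the whole READDIR. A rough model of the commit-on-success pattern (structure heavily simplified):

    #include <stdint.h>

    struct dirlist {
        uint32_t *buffer;   /* next free word */
        int buflen;         /* words remaining */
        uint32_t *offset;   /* last committed cookie slot */
    };

    enum { ATTR_OK = 0, ATTR_NOENT = 1 };

    /* Model: the entry was encoded tentatively up to entry_end; on
     * ATTR_NOENT nothing is committed, so the next entry overwrites
     * the staged one. */
    static void finish_entry(struct dirlist *cd, uint32_t *entry_end,
                             uint32_t *cookiep, int attr_status)
    {
        if (attr_status == ATTR_NOENT)
            return;                       /* skip_entry */
        cd->buflen -= (int)(entry_end - cd->buffer);
        cd->buffer = entry_end;
        cd->offset = cookiep;   /* publish cookie only on success */
    }
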
@@ -2572,6 +2957,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2957}
2573 2958
2574static __be32 2959static __be32
2960nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2961 struct nfsd4_exchange_id *exid)
2962{
2963 ENCODE_HEAD;
2964 char *major_id;
2965 char *server_scope;
2966 int major_id_sz;
2967 int server_scope_sz;
2968 uint64_t minor_id = 0;
2969
2970 if (nfserr)
2971 return nfserr;
2972
2973 major_id = utsname()->nodename;
2974 major_id_sz = strlen(major_id);
2975 server_scope = utsname()->nodename;
2976 server_scope_sz = strlen(server_scope);
2977
2978 RESERVE_SPACE(
2979 8 /* eir_clientid */ +
2980 4 /* eir_sequenceid */ +
2981 4 /* eir_flags */ +
2982 4 /* spr_how (SP4_NONE) */ +
2983 8 /* so_minor_id */ +
2984 4 /* so_major_id.len */ +
2985 (XDR_QUADLEN(major_id_sz) * 4) +
2986 4 /* eir_server_scope.len */ +
2987 (XDR_QUADLEN(server_scope_sz) * 4) +
2988 4 /* eir_server_impl_id.count (0) */);
2989
2990 WRITEMEM(&exid->clientid, 8);
2991 WRITE32(exid->seqid);
2992 WRITE32(exid->flags);
2993
2994 /* state_protect4_r. Currently only support SP4_NONE */
2995 BUG_ON(exid->spa_how != SP4_NONE);
2996 WRITE32(exid->spa_how);
2997
2998 /* The server_owner struct */
2999 WRITE64(minor_id); /* Minor id */
3000 /* major id */
3001 WRITE32(major_id_sz);
3002 WRITEMEM(major_id, major_id_sz);
3003
3004 /* Server scope */
3005 WRITE32(server_scope_sz);
3006 WRITEMEM(server_scope, server_scope_sz);
3007
3008 /* Implementation id */
3009 WRITE32(0); /* zero length nfs_impl_id4 array */
3010 ADJUST_ARGS();
3011 return 0;
3012}
3013
3014static __be32
3015nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3016 struct nfsd4_create_session *sess)
3017{
3018 ENCODE_HEAD;
3019
3020 if (nfserr)
3021 return nfserr;
3022
3023 RESERVE_SPACE(24);
3024 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3025 WRITE32(sess->seqid);
3026 WRITE32(sess->flags);
3027 ADJUST_ARGS();
3028
3029 RESERVE_SPACE(28);
3030 WRITE32(0); /* headerpadsz */
3031 WRITE32(sess->fore_channel.maxreq_sz);
3032 WRITE32(sess->fore_channel.maxresp_sz);
3033 WRITE32(sess->fore_channel.maxresp_cached);
3034 WRITE32(sess->fore_channel.maxops);
3035 WRITE32(sess->fore_channel.maxreqs);
3036 WRITE32(sess->fore_channel.nr_rdma_attrs);
3037 ADJUST_ARGS();
3038
3039 if (sess->fore_channel.nr_rdma_attrs) {
3040 RESERVE_SPACE(4);
3041 WRITE32(sess->fore_channel.rdma_attrs);
3042 ADJUST_ARGS();
3043 }
3044
3045 RESERVE_SPACE(28);
3046 WRITE32(0); /* headerpadsz */
3047 WRITE32(sess->back_channel.maxreq_sz);
3048 WRITE32(sess->back_channel.maxresp_sz);
3049 WRITE32(sess->back_channel.maxresp_cached);
3050 WRITE32(sess->back_channel.maxops);
3051 WRITE32(sess->back_channel.maxreqs);
3052 WRITE32(sess->back_channel.nr_rdma_attrs);
3053 ADJUST_ARGS();
3054
3055 if (sess->back_channel.nr_rdma_attrs) {
3056 RESERVE_SPACE(4);
3057 WRITE32(sess->back_channel.rdma_attrs);
3058 ADJUST_ARGS();
3059 }
3060 return 0;
3061}
3062
3063static __be32
3064nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3065 struct nfsd4_destroy_session *destroy_session)
3066{
3067 return nfserr;
3068}
3069
3070__be32
3071nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3072 struct nfsd4_sequence *seq)
3073{
3074 ENCODE_HEAD;
3075
3076 if (nfserr)
3077 return nfserr;
3078
3079 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3080 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3081 WRITE32(seq->seqid);
3082 WRITE32(seq->slotid);
3083 WRITE32(seq->maxslots);
3084 /*
3085 * FIXME: for now:
3086 * target_maxslots = maxslots
3087 * status_flags = 0
3088 */
3089 WRITE32(seq->maxslots);
3090 WRITE32(0);
3091
3092 ADJUST_ARGS();
3093 return 0;
3094}
3095
3096static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3097nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3098{
2577 return nfserr; 3099 return nfserr;
@@ -2579,6 +3101,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3101
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3102typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3103
3104/*
3105 * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
3106 * since we don't need to filter out obsolete ops as this is
3107 * done in the decoding phase.
3108 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3109static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3110 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3111 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
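
The RESERVE_SPACE arithmetic in nfsd4_encode_exchange_id above is worth making concrete. For a hypothetical 10-byte nodename such as "nfs-server", each opaque rounds up to 12 bytes on the wire (XDR_QUADLEN pads to 4-byte units), giving 64 bytes total. A runnable check of that sum:

    #include <stdio.h>
    #include <string.h>

    #define XDR_QUADLEN(n) (((n) + 3) >> 2)  /* length in 4-byte XDR units */

    int main(void)
    {
        size_t name = strlen("nfs-server");  /* hypothetical nodename, 10 */
        size_t bytes = 8                     /* eir_clientid */
                     + 4                     /* eir_sequenceid */
                     + 4                     /* eir_flags */
                     + 4                     /* spr_how (SP4_NONE) */
                     + 8                     /* so_minor_id */
                     + 4 + XDR_QUADLEN(name) * 4  /* so_major_id */
                     + 4 + XDR_QUADLEN(name) * 4  /* eir_server_scope */
                     + 4;                    /* impl_id array count (0) */
        printf("%zu bytes reserved\n", bytes);    /* 64 */
        return 0;
    }
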
@@ -2617,8 +3144,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3144 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3145 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3146 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3147
3148 /* NFSv4.1 operations */
3149 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3152 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3153 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3154 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3158 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3159 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3161 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3162 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3163 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3164 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3165 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3166 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3167 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3168};
2621 3169
3170/*
3171 * Calculate the total amount of memory that the compound response has taken
3172 * after encoding the current operation.
3173 *
3174 * pad: add on 8 bytes for the next operation's op_code and status so that
3175 * there is room to cache a failure on the next operation.
3176 *
3177 * Compare this length to the session se_fmaxresp_cached.
3178 *
3179 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3180 * will be at least a page and will therefore hold the xdr_buf head.
3181 */
3182static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3183{
3184 int status = 0;
3185 struct xdr_buf *xb = &resp->rqstp->rq_res;
3186 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3187 struct nfsd4_session *session = NULL;
3188 struct nfsd4_slot *slot = resp->cstate.slot;
3189 u32 length, tlen = 0, pad = 8;
3190
3191 if (!nfsd4_has_session(&resp->cstate))
3192 return status;
3193
3194 session = resp->cstate.session;
3195 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3196 return status;
3197
3198 if (resp->opcnt >= args->opcnt)
3199 pad = 0; /* this is the last operation */
3200
3201 if (xb->page_len == 0) {
3202 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3203 } else {
3204 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3205 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3206
3207 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3208 }
3209 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3210 length, xb->page_len, tlen, pad);
3211
3212 if (length <= session->se_fmaxresp_cached)
3213 return status;
3214 else
3215 return nfserr_rep_too_big_to_cache;
3216}
3217
2622void 3218void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3219nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3220{
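
nfsd4_check_drc_limit totals what the compound reply occupies so far -- head bytes, full pages, and any partial tail -- plus an 8-byte pad reserving room for the next op's code and status, then compares that against the session's negotiated se_fmaxresp_cached; exceeding it fails the op with nfserr_rep_too_big_to_cache. A standalone model of the accounting (xdr_buf plumbing omitted):

    #include <stddef.h>

    /* Model: the reply spans the head iovec, any full pages, and a
     * partial tail; pad reserves 8 bytes for the next op's code and
     * status unless this was the compound's last op. */
    static size_t cached_reply_len(size_t head_used, size_t page_len,
                                   size_t tail_used, int is_last_op)
    {
        size_t pad = is_last_op ? 0 : 8;

        return head_used + page_len + tail_used + pad;
    }
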
@@ -2635,6 +3231,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3231 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3232 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3233 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3234 /* nfsd4_check_drc_limit guarantees enough room for error status */
3235 if (!op->status && nfsd4_check_drc_limit(resp))
3236 op->status = nfserr_rep_too_big_to_cache;
2638status: 3237status:
2639 /* 3238 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3239 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3334,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3334 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3335 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3336 BUG_ON(iov->iov_len > PAGE_SIZE);
3337 if (nfsd4_has_session(&resp->cstate)) {
3338 if (resp->cstate.status == nfserr_replay_cache &&
3339 !nfsd4_not_cached(resp)) {
3340 iov->iov_len = resp->cstate.iovlen;
3341 } else {
3342 nfsd4_store_cache_entry(resp);
3343 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3344 resp->cstate.slot->sl_inuse = 0;
3345 }
3346 if (resp->cstate.session)
3347 nfsd4_put_session(resp->cstate.session);
3348 }
2738 return 1; 3349 return 1;
2739} 3350}
2740 3351
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
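
With this parsing change the nfsd "versions" control file accepts dotted tokens: writing "+4.1" enables the minorversion and "-4.1" clears it, while plain "+4"/"-4" still flips NFSv4 as a whole (majors below 4 and minor 0 are rejected with -EINVAL). A user-space model of the new token path (strtol/strtoul stand in for the kernel's simple_strtol/simple_strtoul):

    #include <stdio.h>
    #include <stdlib.h>

    /* Model: for "+4.1", strtol consumes the "4" and leaves minorp at
     * ".1"; the digits after the dot become the minorversion to set
     * (or clear, for a '-' sign). */
    int main(void)
    {
        const char *vers = "+4.1";
        char sign = *vers;
        char *minorp;
        long num = strtol(vers + 1, &minorp, 0);            /* 4 */

        if (num >= 4 && *minorp == '.') {
            unsigned long minor = strtoul(minorp + 1, NULL, 0);  /* 1 */
            printf("%s minorversion %lu\n",
                   sign == '-' ? "clear" : "set", minor);
        }
        return 0;
    }
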
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -938,10 +969,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
938 char transport[16]; 969 char transport[16];
939 int port; 970 int port;
940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 971 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
972 if (port < 1 || port > 65535)
973 return -EINVAL;
941 err = nfsd_create_serv(); 974 err = nfsd_create_serv();
942 if (!err) { 975 if (!err) {
943 err = svc_create_xprt(nfsd_serv, 976 err = svc_create_xprt(nfsd_serv,
944 transport, port, 977 transport, PF_INET, port,
945 SVC_SOCK_ANONYMOUS); 978 SVC_SOCK_ANONYMOUS);
946 if (err == -ENOENT) 979 if (err == -ENOENT)
947 /* Give a reasonable perror msg for 980 /* Give a reasonable perror msg for
@@ -960,7 +993,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
960 char transport[16]; 993 char transport[16];
961 int port; 994 int port;
962 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 995 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
963 if (port == 0) 996 if (port < 1 || port > 65535)
964 return -EINVAL; 997 return -EINVAL;
965 if (nfsd_serv) { 998 if (nfsd_serv) {
966 xprt = svc_find_xprt(nfsd_serv, transport, 999 xprt = svc_find_xprt(nfsd_serv, transport,
@@ -1246,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1246 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1247 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1248 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1249 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1251 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
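
Note on the hunk above: nfsd_proc_write() now passes the byte count by reference (&cnt) instead of by value; together with the fs/nfsd/vfs.c hunk further down (where nfsd_vfs_write() does *cnt = host_err), the caller can learn how many bytes vfs_writev() actually wrote. A minimal user-space sketch of that in/out-count pattern; do_write() and its names are illustrative, not taken from this patch:

    #include <sys/uio.h>
    #include <errno.h>

    /* Caller passes the requested length in *cnt; on success *cnt is
     * overwritten with the number of bytes actually written, mirroring
     * what nfsd_vfs_write() now does with host_err. */
    static int do_write(int fd, const struct iovec *vec, int vlen,
                        unsigned long *cnt)
    {
            ssize_t n = writev(fd, vec, vlen);  /* may be a short write */

            if (n < 0)
                    return -errno;
            *cnt = (unsigned long)n;
            return 0;
    }
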
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
226 * Each session guarantees a negotiated per-slot memory cache for replies,
227 * which in turn consumes memory beyond what a v2/v3/v4.0 server needs. A
228 * dedicated NFSv4.1 server might want to use more memory for a DRC than a
229 * machine running multiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC, which varies
232 * according to the machine's free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
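
A quick worked example of the NFSD_DRC_SIZE_SHIFT arithmetic above: a right shift by 7 caps the DRC at 1/128, about 0.78%, of the free buffer pages. The free-page count below is purely illustrative:

    unsigned long free_pages = 1000000;          /* ~3.8 GiB of free buffer pages at 4 KiB/page */
    unsigned long drc_max    = free_pages >> 7;  /* 7812 pages, roughly 30 MiB */
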
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,12 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
233 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
234 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
235 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
236 280
237 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
238 return err; 282 return err;
@@ -244,7 +288,7 @@ static int nfsd_init_socks(int port)
244 if (!list_empty(&nfsd_serv->sv_permsocks)) 288 if (!list_empty(&nfsd_serv->sv_permsocks))
245 return 0; 289 return 0;
246 290
247 error = svc_create_xprt(nfsd_serv, "udp", port, 291 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
248 SVC_SOCK_DEFAULTS); 292 SVC_SOCK_DEFAULTS);
249 if (error < 0) 293 if (error < 0)
250 return error; 294 return error;
@@ -253,7 +297,7 @@ static int nfsd_init_socks(int port)
253 if (error < 0) 297 if (error < 0)
254 return error; 298 return error;
255 299
256 error = svc_create_xprt(nfsd_serv, "tcp", port, 300 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
257 SVC_SOCK_DEFAULTS); 301 SVC_SOCK_DEFAULTS);
258 if (error < 0) 302 if (error < 0)
259 return error; 303 return error;
@@ -376,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
376 return error; 420 return error;
377} 421}
378 422
379static inline void
380update_thread_usage(int busy_threads)
381{
382 unsigned long prev_call;
383 unsigned long diff;
384 int decile;
385
386 spin_lock(&nfsd_call_lock);
387 prev_call = nfsd_last_call;
388 nfsd_last_call = jiffies;
389 decile = busy_threads*10/nfsdstats.th_cnt;
390 if (decile>0 && decile <= 10) {
391 diff = nfsd_last_call - prev_call;
392 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
393 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
394 if (decile == 10)
395 nfsdstats.th_fullcnt++;
396 }
397 spin_unlock(&nfsd_call_lock);
398}
399 423
400/* 424/*
401 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -404,7 +428,6 @@ static int
404nfsd(void *vrqstp) 428nfsd(void *vrqstp)
405{ 429{
406 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 430 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
407 struct fs_struct *fsp;
408 int err, preverr = 0; 431 int err, preverr = 0;
409 432
410 /* Lock module and set up kernel thread */ 433 /* Lock module and set up kernel thread */
@@ -413,13 +436,11 @@ nfsd(void *vrqstp)
413 /* At this point, the thread shares current->fs 436 /* At this point, the thread shares current->fs
414 * with the init process. We need to create files with a 437 * with the init process. We need to create files with a
415 * umask of 0 instead of init's umask. */ 438 * umask of 0 instead of init's umask. */
416 fsp = copy_fs_struct(current->fs); 439 if (unshare_fs_struct() < 0) {
417 if (!fsp) {
418 printk("Unable to start nfsd thread: out of memory\n"); 440 printk("Unable to start nfsd thread: out of memory\n");
419 goto out; 441 goto out;
420 } 442 }
421 exit_fs(current); 443
422 current->fs = fsp;
423 current->fs->umask = 0; 444 current->fs->umask = 0;
424 445
425 /* 446 /*
@@ -464,8 +485,6 @@ nfsd(void *vrqstp)
464 continue; 485 continue;
465 } 486 }
466 487
467 update_thread_usage(atomic_read(&nfsd_busy));
468 atomic_inc(&nfsd_busy);
469 488
470 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
471 exp_readlock(); 490 exp_readlock();
@@ -474,8 +493,6 @@ nfsd(void *vrqstp)
474 493
475 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
476 exp_readunlock(); 495 exp_readunlock();
477 update_thread_usage(atomic_read(&nfsd_busy));
478 atomic_dec(&nfsd_busy);
479 } 496 }
480 497
481 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -543,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
543 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
544 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
545 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
546 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
547 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
548 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -574,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
574 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
575 return 1; 596 return 1;
576} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..b660435978d2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
116 } 116 }
117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 117 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
118 /* successfully crossed mount point */ 118 /* successfully crossed mount point */
119 exp_put(exp); 119 /*
120 *expp = exp2; 120 * This is subtle: dentry is *not* under mnt at this point.
121 * The only reason we are safe is that original mnt is pinned
122 * down by exp, so we should dput before putting exp.
123 */
121 dput(dentry); 124 dput(dentry);
122 *dpp = mounts; 125 *dpp = mounts;
126 exp_put(exp);
127 *expp = exp2;
123 } else { 128 } else {
124 exp_put(exp2); 129 exp_put(exp2);
125 dput(mounts); 130 dput(mounts);
@@ -366,8 +371,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 371 }
367 372
368 /* Revoke setuid/setgid on chown */ 373 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 374 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 375 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
376 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 377 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 378 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 379 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +966,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 966static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 967nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 968 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 969 unsigned long *cnt, int *stablep)
964{ 970{
965 struct svc_export *exp; 971 struct svc_export *exp;
966 struct dentry *dentry; 972 struct dentry *dentry;
@@ -974,7 +980,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 980 err = nfserr_perm;
975 981
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 982 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 983 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 984 goto out;
979#endif 985#endif
980 986
@@ -1009,7 +1015,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1015 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1016 set_fs(oldfs);
1011 if (host_err >= 0) { 1017 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1018 *cnt = host_err;
1019 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1020 fsnotify_modify(file->f_path.dentry);
1014 } 1021 }
1015 1022
@@ -1056,7 +1063,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1063 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1064 if (host_err >= 0)
1058 err = 0; 1065 err = 0;
1059 else 1066 else
1060 err = nfserrno(host_err); 1067 err = nfserrno(host_err);
1061out: 1068out:
1062 return err; 1069 return err;
@@ -1098,7 +1105,7 @@ out:
1098 */ 1105 */
1099__be32 1106__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1107nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1108 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1109 int *stablep)
1103{ 1110{
1104 __be32 err = 0; 1111 __be32 err = 0;
@@ -1179,6 +1186,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1186 return 0;
1180} 1187}
1181 1188
1189/* An HPUX client sometimes creates a file in mode 000 and sets its size to 0.
1190 * Setting the size to 0 may fail on some file systems because the permission
1191 * check requires WRITE permission while the mode is 000.
1192 * We ignore the resize (to 0) of a file that has just been created, since its
1193 * size is already 0.
1194 *
1195 * Call this only after vfs_create() has been called.
1196 */
1197static void
1198nfsd_check_ignore_resizing(struct iattr *iap)
1199{
1200 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1201 iap->ia_valid &= ~ATTR_SIZE;
1202}
1203
1182/* 1204/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1205 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1206 * not yet implemented.
@@ -1274,6 +1296,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1296 switch (type) {
1275 case S_IFREG: 1297 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1298 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1299 if (!host_err)
1300 nfsd_check_ignore_resizing(iap);
1277 break; 1301 break;
1278 case S_IFDIR: 1302 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1303 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1451,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1451 /* setattr will sync the child (or not) */
1428 } 1452 }
1429 1453
1454 nfsd_check_ignore_resizing(iap);
1455
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1456 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1457 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1458 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1864,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
1864 return 0; 1890 return 0;
1865} 1891}
1866 1892
1867static int nfsd_buffered_readdir(struct file *file, filldir_t func, 1893static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
1868 struct readdir_cd *cdp, loff_t *offsetp) 1894 struct readdir_cd *cdp, loff_t *offsetp)
1869{ 1895{
1870 struct readdir_data buf; 1896 struct readdir_data buf;
1871 struct buffered_dirent *de; 1897 struct buffered_dirent *de;
@@ -1875,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1875 1901
1876 buf.dirent = (void *)__get_free_page(GFP_KERNEL); 1902 buf.dirent = (void *)__get_free_page(GFP_KERNEL);
1877 if (!buf.dirent) 1903 if (!buf.dirent)
1878 return -ENOMEM; 1904 return nfserrno(-ENOMEM);
1879 1905
1880 offset = *offsetp; 1906 offset = *offsetp;
1881 1907
1882 while (1) { 1908 while (1) {
1909 struct inode *dir_inode = file->f_path.dentry->d_inode;
1883 unsigned int reclen; 1910 unsigned int reclen;
1884 1911
1885 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1912 cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1898,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
1898 if (!size) 1925 if (!size)
1899 break; 1926 break;
1900 1927
1928 /*
1929 * Various filldir functions may end up calling back into
1930 * lookup_one_len() and the file system's ->lookup() method.
1931 * These expect i_mutex to be held, as it would within readdir.
1932 */
1933 host_err = mutex_lock_killable(&dir_inode->i_mutex);
1934 if (host_err)
1935 break;
1936
1901 de = (struct buffered_dirent *)buf.dirent; 1937 de = (struct buffered_dirent *)buf.dirent;
1902 while (size > 0) { 1938 while (size > 0) {
1903 offset = de->offset; 1939 offset = de->offset;
1904 1940
1905 if (func(cdp, de->name, de->namlen, de->offset, 1941 if (func(cdp, de->name, de->namlen, de->offset,
1906 de->ino, de->d_type)) 1942 de->ino, de->d_type))
1907 goto done; 1943 break;
1908 1944
1909 if (cdp->err != nfs_ok) 1945 if (cdp->err != nfs_ok)
1910 goto done; 1946 break;
1911 1947
1912 reclen = ALIGN(sizeof(*de) + de->namlen, 1948 reclen = ALIGN(sizeof(*de) + de->namlen,
1913 sizeof(u64)); 1949 sizeof(u64));
1914 size -= reclen; 1950 size -= reclen;
1915 de = (struct buffered_dirent *)((char *)de + reclen); 1951 de = (struct buffered_dirent *)((char *)de + reclen);
1916 } 1952 }
1953 mutex_unlock(&dir_inode->i_mutex);
1954 if (size > 0) /* We bailed out early */
1955 break;
1956
1917 offset = vfs_llseek(file, 0, SEEK_CUR); 1957 offset = vfs_llseek(file, 0, SEEK_CUR);
1918 } 1958 }
1919 1959
1920 done:
1921 free_page((unsigned long)(buf.dirent)); 1960 free_page((unsigned long)(buf.dirent));
1922 1961
1923 if (host_err) 1962 if (host_err)
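
The nfsd_buffered_readdir() hunks above amount to this pattern: entries are read into a scratch page without the directory's i_mutex held, and the mutex is then taken (killably) only while the entries are replayed through the filldir callback, since the callback may re-enter the file system via lookup_one_len(). A simplified sketch of that shape; read_entries_into_buffer() and replay_entries() are hypothetical helpers standing in for the vfs_readdir call and the inner loop of the real code:

    for (;;) {
            int size = read_entries_into_buffer(file, &buf);   /* hypothetical: fills one page */
            if (size <= 0)
                    break;                                     /* EOF or error */

            if (mutex_lock_killable(&dir_inode->i_mutex))
                    break;                                     /* interrupted by a fatal signal */
            stopped = replay_entries(&buf, size, func, cdp);   /* hypothetical: calls func() per entry */
            mutex_unlock(&dir_inode->i_mutex);

            if (stopped)                                       /* filldir bailed out early */
                    break;
    }
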
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
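
To make the layout arithmetic above concrete, here is a standalone sketch that computes the group geometry for one illustrative configuration; the 4 KiB block size, 128-byte entry size, and 4-byte group descriptor are assumptions for the example, not values taken from this patch:

    #include <stdio.h>

    int main(void)
    {
            unsigned long blksize = 4096, entry_size = 128, desc_size = 4;
            unsigned long entries_per_group = blksize * 8;            /* one bitmap bit per entry */
            unsigned long entries_per_block = blksize / entry_size;   /* 32 */
            unsigned long blocks_per_group =                          /* +1 for the bitmap block */
                    (entries_per_group + entries_per_block - 1) / entries_per_block + 1;
            unsigned long groups_per_desc = blksize / desc_size;      /* 1024 */
            unsigned long blocks_per_desc_block = groups_per_desc * blocks_per_group + 1;

            printf("entries/group=%lu blocks/group=%lu blocks/desc-block=%lu\n",
                   entries_per_group, blocks_per_group, blocks_per_desc_block);
            /* prints: entries/group=32768 blocks/group=1025 blocks/desc-block=1049601 */
            return 0;
    }
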
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
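
nilfs_palloc_find_available_slot() above performs a two-phase scan: it first looks near the hinted target, then sweeps the rest of the bitmap with wrap-around, skipping words that are already all ones. A minimal non-atomic user-space sketch of the same idea (the kernel version additionally holds the per-group bitmap lock and so is safe against concurrent allocators):

    #include <limits.h>

    /* Assumes nbits is a multiple of the word size, as in the palloc
     * bitmaps, where a group bitmap fills a whole block. */
    static int find_and_set_zero_bit(unsigned long *map, int nbits, int hint)
    {
            const int wbits = sizeof(unsigned long) * CHAR_BIT;
            int i;

            for (i = 0; i < nbits; i++) {
                    int pos = (hint + i) % nbits;

                    if (map[pos / wbits] == ~0UL) {
                            /* word is full: jump to its last bit so the
                             * loop increment moves us to the next word */
                            i += wbits - 1 - (pos % wbits);
                            continue;
                    }
                    if (!(map[pos / wbits] & (1UL << (pos % wbits)))) {
                            map[pos / wbits] |= 1UL << (pos % wbits);
                            return pos;
                    }
            }
            return -1;      /* every slot taken, cf. -ENOSPC above */
    }
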
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392 printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35 return 1UL << (inode->i_blkbits + 3 /* log2(8) = log2(CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
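
The header above exposes a prepare/commit/abort triple for both allocation and free. The expected calling sequence, sketched under the assumption of a caller that already has the metadata inode and an entry-number hint; 'hint' and 'all_went_well' are illustrative, and error handling is trimmed:

    struct nilfs_palloc_req req = { .pr_entry_nr = hint };
    int err;

    err = nilfs_palloc_prepare_alloc_entry(inode, &req);
    if (err)
            return err;          /* e.g. -ENOSPC: nothing to undo */

    /* ... record req.pr_entry_nr in the caller's structures ... */

    if (all_went_well)
            nilfs_palloc_commit_alloc_entry(inode, &req);   /* dirties bitmap + descriptor */
    else
            nilfs_palloc_abort_alloc_entry(inode, &req);    /* clears the bit again */
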
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..064279e33bbb
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,788 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect the dirty buffers of a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bh: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bh.
325 *
326 * Return Value: On success, 0 is returned, and the buffer head of a newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the places pointed to by @bh and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to test and
382 * clear the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
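
A worked example of the heuristic above, assuming 4 KiB blocks so that the DAT has 32768 entries per group: an inode with i_ino = 42 falls in group 0 and, since 42 % 8 = 2, gets the target 2 * (32768 / 8) = 8192. Inodes are thus spread across eight evenly spaced starting points within their group, which reduces contention on any single region of the bitmap.
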
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691static struct lock_class_key nilfs_bmap_dat_lock_key;
692
693/**
694 * nilfs_bmap_read - read a bmap from an inode
695 * @bmap: bmap
696 * @raw_inode: on-disk inode
697 *
698 * Description: nilfs_bmap_read() initializes the bmap @bmap.
699 *
700 * Return Value: On success, 0 is returned. On error, the following negative
701 * error code is returned.
702 *
703 * %-ENOMEM - Insufficient amount of memory available.
704 */
705int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
706{
707 if (raw_inode == NULL)
708 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
709 else
710 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
711
712 init_rwsem(&bmap->b_sem);
713 bmap->b_state = 0;
714 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
715 switch (bmap->b_inode->i_ino) {
716 case NILFS_DAT_INO:
717 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
718 bmap->b_last_allocated_key = 0; /* XXX: use macro */
719 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
720 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
721 break;
722 case NILFS_CPFILE_INO:
723 case NILFS_SUFILE_INO:
724 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
725 bmap->b_last_allocated_key = 0; /* XXX: use macro */
726 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
727 break;
728 default:
729 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
730 bmap->b_last_allocated_key = 0; /* XXX: use macro */
731 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
732 break;
733 }
734
735 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
736 nilfs_btree_init(bmap,
737 NILFS_BMAP_LARGE_LOW,
738 NILFS_BMAP_LARGE_HIGH) :
739 nilfs_direct_init(bmap,
740 NILFS_BMAP_SMALL_LOW,
741 NILFS_BMAP_SMALL_HIGH);
742}
743
744/**
745 * nilfs_bmap_write - write back a bmap to an inode
746 * @bmap: bmap to store
747 * @raw_inode: on-disk inode in which to store the bmap
748 *
749 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
750 */
751void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
752{
753 down_write(&bmap->b_sem);
754 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
755 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
756 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
757 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
758
759 up_write(&bmap->b_sem);
760}
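/*
 * Illustrative sketch, not part of the original file: a typical life
 * cycle pairs the two helpers above around in-core use of the bmap.
 * The wrapper function and its arguments are hypothetical.
 */
#if 0 /* example only */
static int example_bmap_round_trip(struct nilfs_bmap *bmap,
				   struct nilfs_inode *raw_inode)
{
	int err;

	err = nilfs_bmap_read(bmap, raw_inode); /* NULL raw_inode => empty bmap */
	if (err)
		return err;	/* only -ENOMEM, per the kernel-doc above */

	/* ... insert/lookup/delete operations, serialized by b_sem ... */

	nilfs_bmap_write(bmap, raw_inode); /* copy back before the inode hits disk */
	return 0;
}
#endif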
761
762void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
763{
764 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
765 init_rwsem(&bmap->b_sem);
766 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
767 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
768 bmap->b_last_allocated_key = 0;
769 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
770 bmap->b_state = 0;
771 nilfs_btree_init_gc(bmap);
772}
773
774void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
775{
776 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
777 init_rwsem(&gcbmap->b_sem);
778 lockdep_set_class(&gcbmap->b_sem, &nilfs_bmap_dat_lock_key);
779 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
780}
781
782void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
783{
784 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
785 init_rwsem(&bmap->b_sem);
786 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
787 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
788}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
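/*
 * Note: the two members deliberately overlay.  Bmaps addressed by
 * virtual block numbers (the _v pointer operations in bmap.c) fill in
 * @bpr_req and go through the persistent allocator, while the DAT bmap
 * (the _p operations) deals in raw block numbers and uses only
 * @bpr_ptr.
 */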
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
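/*
 * Illustrative calling pattern, not taken from the original source:
 * each pointer operation is split into prepare/commit/abort so that a
 * caller can reserve resources up front and roll back cleanly when a
 * later step fails.  update_in_memory_state() is a hypothetical
 * fallible step.
 */
#if 0 /* example only */
	union nilfs_bmap_ptr_req req;
	int err;

	err = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, &req);
	if (err < 0)
		return err;			/* nothing reserved yet */

	err = update_in_memory_state();
	if (err < 0) {
		bmap->b_pops->bpop_abort_alloc_ptr(bmap, &req);
		return err;			/* reservation rolled back */
	}
	bmap->b_pops->bpop_commit_alloc_ptr(bmap, &req); /* must not fail */
#endif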
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
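/*
 * NILFS_BMAP_NEW_PTR_INIT sets only the most significant bit of an
 * unsigned long, so nilfs_bmap_is_new_ptr() is a plain test of that top
 * bit.  The DAT pointer operations in bmap.c appear to hand out "new"
 * (not yet assigned) pointers by counting up from this value, e.g.
 * 0x8000000000000000, 0x8000000000000001, ... on a 64-bit machine, on
 * the assumption that real block addresses never use the top bit.
 */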
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * union nilfs_bmap_union - block mapping union
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53 btnc->host = NULL; /* can safely set to host inode ? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
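/*
 * Illustrative sketch, not part of the original file: a typical caller
 * reads one node block through the cache and releases it with brelse()
 * when done.  @btnc and @blocknr stand for a node cache and a block
 * number obtained elsewhere.
 */
#if 0 /* example only */
	struct buffer_head *bh;
	int err;

	err = nilfs_btnode_get(btnc, blocknr, 0 /* translate via DAT */,
			       &bh, 0 /* existing block */);
	if (err)
		return err;	/* -ENOMEM, -EIO, ... */
	/* ... examine (struct nilfs_btree_node *)bh->b_data ... */
	brelse(bh);
#endif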
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing it once the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key
184 *  Prepare to move the contents of the block for the old key to the block
185 *  for the new key.  The old buffer will not be removed, but it might be
186 *  reused for the new buffer.  This function may return -ENOMEM on memory
187 *  allocation failure or -EIO on disk read failure.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload on kernels older than
207 * 2.6.23 because it is not exported to modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223 /*
224 * Note: page->index will not change to newkey until
225 * nilfs_btnode_commit_change_key() is called.
226 * To protect the page in this intermediate state, the
227 * page lock is held.
228 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key
256 * commit the change_key operation prepared by prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key
298 * Abort the change_key operation prepared by nilfs_btnode_prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
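/*
 * Illustrative sketch, not part of the original file: the three
 * change-key helpers above form a small transaction around any
 * bookkeeping that may still fail.  record_key_change() is a
 * hypothetical fallible step.
 */
#if 0 /* example only */
	struct nilfs_btnode_chkey_ctxt ctxt = {
		.oldkey = oldkey,
		.newkey = newkey,
		.bh     = bh,
	};
	int err;

	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
	if (err)
		return err;

	err = record_key_change();
	if (err) {
		nilfs_btnode_abort_change_key(btnc, &ctxt);
		return err;
	}
	nilfs_btnode_commit_change_key(btnc, &ctxt);
	/* ctxt.bh now refers to the buffer filed under newkey */
#endif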
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
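 * @bp_ctxt: change-key context used when a node block moves to a new key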
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
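/*
 * Worked example, for illustration: with left = {(k1,p1), (k2,p2), (k3,p3)}
 * and right = {(k7,p7)}, nilfs_btree_node_move_right(btree, left, right, 2)
 * leaves left = {(k1,p1)} and right = {(k2,p2), (k3,p3), (k7,p7)}.  Whole
 * suffixes and prefixes move, so the key order inside each node is kept.
 */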
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
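/*
 * Worked example, for illustration: for a leaf node holding keys
 * {2, 5, 9}, nilfs_btree_node_lookup(btree, node, 5, &i) returns nonzero
 * with i == 1.  Looking up the absent key 6 returns zero with i == 2,
 * the slot where 6 would be inserted.  On interior nodes a miss instead
 * backs i up so that it names the child subtree covering the key.
 */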
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
799
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
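/*
 * Editorial example of the borrow arithmetic above: if the underflowing
 * node has nchildren = 100 after the delete and its left sibling has
 * lnchildren = 140, then n = (100 + 140) / 2 - 100 = 20 entries move
 * right, leaving both nodes balanced at 120 children; bp_index is
 * shifted by the same n.
 */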
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
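/*
 * Editorial summary of the per-level sibling policy chosen above:
 *
 *	node still above minimum  ->  nilfs_btree_do_delete (stop here)
 *	left sibling above min    ->  nilfs_btree_borrow_left (stop)
 *	left sibling at minimum   ->  nilfs_btree_concat_left (continue up)
 *	right sibling above min   ->  nilfs_btree_borrow_right (stop)
 *	right sibling at minimum  ->  nilfs_btree_concat_right (continue up)
 *	no sibling (root's child) ->  nilfs_btree_shrink or do_delete
 *
 * Borrowing rebalances and terminates the upward walk; concatenation
 * removes a node and may propagate the underflow to the parent level.
 */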
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
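/*
 * Editorial note (a reading of the code above): nilfs_btree_check_delete()
 * returns nonzero when @key is the current maximum key and the second
 * largest key falls below bmap->b_low, i.e. when deleting @key would
 * leave all remaining keys in a range a direct (small) mapping could
 * represent, making the tree a candidate for conversion back.
 */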
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585 /* for data */
1586 /* cannot find near ptr */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619 return ret;
1621}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert a bmap to a B-tree and insert a new entry
1691 * @bmap: bmap to be converted
1692 * @key: key of the new entry
1693 * @ptr: pointer (data block) of the new entry
1694 * @keys: keys of the entries gathered from the original bmap
1695 * @ptrs: pointers of the entries gathered from the original bmap
1696 * @n: number of gathered entries
1697 * @low: lower bound of the key range
1698 * @high: upper bound of the key range
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
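/*
 * Editorial example of the capacity test above, assuming 4 KiB blocks:
 * NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16 = 255, so
 * up to 254 gathered entries plus the new one fit in a single level-1
 * child node (the root + child case, ni != NULL); if n + 1 fits in the
 * in-inode root itself, no child block is allocated at all (ni == NULL).
 * Anything larger is a bug in the caller, hence the BUG().
 */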
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928			printk(KERN_CRIT "%s: key = %llu, level = %d\n",
1929			       __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
2009
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
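/*
 * Editorial note on the switch above: the DAT file is what translates
 * virtual block numbers into physical ones for every other file, so its
 * own B-tree cannot use virtual pointers; it gets the "p" (physical)
 * operation table, while all other inodes use the "v" (virtual) table.
 */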
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
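/*
 * Editorial example, assuming a 4 KiB node: sizeof(struct
 * nilfs_btree_node) is 8 bytes and the extra pad is another 8, so
 * NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16 = 255 and
 * NILFS_BTREE_NODE_NCHILDREN_MIN(4096) = (255 - 1) / 2 + 1 = 128,
 * i.e. non-root nodes are kept at least half full.
 */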
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..300f1cdfa862
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,927 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
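/*
 * Editorial example of the two helpers above: with, say, 16 checkpoints
 * per block and mi_first_entry_offset = 2 (both are runtime properties
 * of the metadata file, not constants), checkpoint number 33 gives
 * tcno = 33 + 2 - 1 = 34, so blkoff = 34 / 16 = 2 and the in-block
 * offset = 34 % 16 = 2: the entry lives at index 2 of file block 2.
 */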
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the place pointed by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
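/*
 * Editorial sketch (hypothetical caller): a successful call must be
 * paired with nilfs_cpfile_put_checkpoint(), which drops the kmap and
 * the buffer reference taken above:
 *
 *	struct nilfs_checkpoint *cp;
 *	struct buffer_head *bh;
 *	int err = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
 *	if (!err) {
 *		... read or update *cp ...
 *		nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
 *	}
 */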
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number (exclusive)
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the range from @start to @end, excluding @end itself. Checkpoints
267 * which have already been deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_header;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_header;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364
365 out_header:
366 brelse(header_bh);
367
368 out_sem:
369 up_write(&NILFS_MDT(cpfile)->mi_sem);
370 return ret;
371}
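/*
 * Editorial example: nilfs_cpfile_delete_checkpoints(cpfile, 10, 13)
 * invalidates checkpoints 10, 11 and 12 (13 is excluded), silently
 * skipping entries that were already deleted, and frees any checkpoint
 * block whose valid-entry count drops to zero.
 */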
372
373static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
374 struct nilfs_checkpoint *cp,
375 struct nilfs_cpinfo *ci)
376{
377 ci->ci_flags = le32_to_cpu(cp->cp_flags);
378 ci->ci_cno = le64_to_cpu(cp->cp_cno);
379 ci->ci_create = le64_to_cpu(cp->cp_create);
380 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
381 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
382 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
383 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
384}
385
386static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
387 struct nilfs_cpinfo *ci, size_t nci)
388{
389 struct nilfs_checkpoint *cp;
390 struct buffer_head *bh;
391 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
392 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
393 void *kaddr;
394 int n, ret;
395 int ncps, i;
396
397 if (cno == 0)
398 return -ENOENT; /* checkpoint number 0 is invalid */
399 down_read(&NILFS_MDT(cpfile)->mi_sem);
400
401 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
402 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
403 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
404 if (ret < 0) {
405 if (ret != -ENOENT)
406 goto out;
407 continue; /* skip hole */
408 }
409
410 kaddr = kmap_atomic(bh->b_page, KM_USER0);
411 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
412 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
413 if (!nilfs_checkpoint_invalid(cp))
414 nilfs_cpfile_checkpoint_to_cpinfo(
415 cpfile, cp, &ci[n++]);
416 }
417 kunmap_atomic(kaddr, KM_USER0);
418 brelse(bh);
419 }
420
421 ret = n;
422 if (n > 0)
423 *cnop = ci[n - 1].ci_cno + 1;
424
425 out:
426 up_read(&NILFS_MDT(cpfile)->mi_sem);
427 return ret;
428}
429
430static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
431 struct nilfs_cpinfo *ci, size_t nci)
432{
433 struct buffer_head *bh;
434 struct nilfs_cpfile_header *header;
435 struct nilfs_checkpoint *cp;
436 __u64 curr = *cnop, next;
437 unsigned long curr_blkoff, next_blkoff;
438 void *kaddr;
439 int n = 0, ret;
440
441 down_read(&NILFS_MDT(cpfile)->mi_sem);
442
443 if (curr == 0) {
444 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
445 if (ret < 0)
446 goto out;
447 kaddr = kmap_atomic(bh->b_page, KM_USER0);
448 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
449 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
450 kunmap_atomic(kaddr, KM_USER0);
451 brelse(bh);
452 if (curr == 0) {
453 ret = 0;
454 goto out;
455 }
456 } else if (unlikely(curr == ~(__u64)0)) {
457 ret = 0;
458 goto out;
459 }
460
461 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
462 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
463 if (unlikely(ret < 0)) {
464 if (ret == -ENOENT)
465 ret = 0; /* No snapshots (started from a hole block) */
466 goto out;
467 }
468 kaddr = kmap_atomic(bh->b_page, KM_USER0);
469 while (n < nci) {
470 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
471 curr = ~(__u64)0; /* Terminator */
472 if (unlikely(nilfs_checkpoint_invalid(cp) ||
473 !nilfs_checkpoint_snapshot(cp)))
474 break;
475 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
476 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
477 if (next == 0)
478 break; /* reach end of the snapshot list */
479
480 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
481 if (curr_blkoff != next_blkoff) {
482 kunmap_atomic(kaddr, KM_USER0);
483 brelse(bh);
484 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
485 0, &bh);
486 if (unlikely(ret < 0)) {
487 WARN_ON(ret == -ENOENT);
488 goto out;
489 }
490 kaddr = kmap_atomic(bh->b_page, KM_USER0);
491 }
492 curr = next;
493 curr_blkoff = next_blkoff;
494 }
495 kunmap_atomic(kaddr, KM_USER0);
496 brelse(bh);
497 *cnop = curr;
498 ret = n;
499
500 out:
501 up_read(&NILFS_MDT(cpfile)->mi_sem);
502 return ret;
503}
504
505/**
506 * nilfs_cpfile_get_cpinfo - get information on checkpoints
507 * @cpfile: inode of checkpoint file
508 * @cnop: checkpoint number cursor; updated past the returned entries
509 * @mode: %NILFS_CHECKPOINT or %NILFS_SNAPSHOT
510 * @ci: array of checkpoint info to be filled in
511 * @nci: maximum number of entries to fill
512 */
513ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
514 struct nilfs_cpinfo *ci, size_t nci)
515{
516 switch (mode) {
517 case NILFS_CHECKPOINT:
518 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
519 case NILFS_SNAPSHOT:
520 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
521 default:
522 return -EINVAL;
523 }
524}
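/*
 * Editorial sketch (hypothetical caller): *cnop acts as a cursor, so
 * checkpoints can be listed in batches:
 *
 *	struct nilfs_cpinfo ci[16];
 *	__u64 cno = 1;	(checkpoint numbers start at 1)
 *	ssize_t n;
 *
 *	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno, NILFS_CHECKPOINT,
 *					    ci, 16)) > 0) {
 *		... consume ci[0] .. ci[n - 1];
 *		    cno now points one past ci[n - 1].ci_cno ...
 *	}
 */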
525
526/**
527 * nilfs_cpfile_delete_checkpoint - delete a single checkpoint
528 * @cpfile: inode of checkpoint file
529 * @cno: checkpoint number to be deleted
530 */
531int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
532{
533 struct nilfs_cpinfo ci;
534 __u64 tcno = cno;
535 ssize_t nci;
536 int ret;
537
538 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
539 if (nci < 0)
540 return nci;
541 else if (nci == 0 || ci.ci_cno != cno)
542 return -ENOENT;
543
544 /* cannot delete the latest checkpoint nor snapshots */
545 ret = nilfs_cpinfo_snapshot(&ci);
546 if (ret < 0)
547 return ret;
548 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
549 return -EPERM;
550
551 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
552}
553
554static struct nilfs_snapshot_list *
555nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
556 __u64 cno,
557 struct buffer_head *bh,
558 void *kaddr)
559{
560 struct nilfs_cpfile_header *header;
561 struct nilfs_checkpoint *cp;
562 struct nilfs_snapshot_list *list;
563
564 if (cno != 0) {
565 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
566 list = &cp->cp_snapshot_list;
567 } else {
568 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
569 list = &header->ch_snapshot_list;
570 }
571 return list;
572}
573
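/*
 * Editorial note on the two functions below: snapshots form a doubly
 * linked list ordered by checkpoint number.  The list head lives in the
 * cpfile header (ch_snapshot_list); each snapshot links to its
 * neighbours through cp_snapshot_list.ssl_next/ssl_prev, with 0 standing
 * for the header.  nilfs_cpfile_set_snapshot() walks the list backwards
 * from the header until it finds the insertion point for @cno and
 * splices the checkpoint in; nilfs_cpfile_clear_snapshot() unlinks it.
 */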
574static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
575{
576 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
577 struct nilfs_cpfile_header *header;
578 struct nilfs_checkpoint *cp;
579 struct nilfs_snapshot_list *list;
580 __u64 curr, prev;
581 unsigned long curr_blkoff, prev_blkoff;
582 void *kaddr;
583 int ret;
584
585 if (cno == 0)
586 return -ENOENT; /* checkpoint number 0 is invalid */
587 down_write(&NILFS_MDT(cpfile)->mi_sem);
588
589 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
590 if (ret < 0)
591 goto out_sem;
592 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
593 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
594 if (nilfs_checkpoint_invalid(cp)) {
595 ret = -ENOENT;
596 kunmap_atomic(kaddr, KM_USER0);
597 goto out_cp;
598 }
599 if (nilfs_checkpoint_snapshot(cp)) {
600 ret = 0;
601 kunmap_atomic(kaddr, KM_USER0);
602 goto out_cp;
603 }
604 kunmap_atomic(kaddr, KM_USER0);
605
606 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
607 if (ret < 0)
608 goto out_cp;
609 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
610 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
611 list = &header->ch_snapshot_list;
612 curr_bh = header_bh;
613 get_bh(curr_bh);
614 curr = 0;
615 curr_blkoff = 0;
616 prev = le64_to_cpu(list->ssl_prev);
617 while (prev > cno) {
618 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
619 curr = prev;
620 if (curr_blkoff != prev_blkoff) {
621 kunmap_atomic(kaddr, KM_USER0);
622 brelse(curr_bh);
623 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
624 0, &curr_bh);
625 if (ret < 0)
626 goto out_header;
627 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
628 }
629 curr_blkoff = prev_blkoff;
630 cp = nilfs_cpfile_block_get_checkpoint(
631 cpfile, curr, curr_bh, kaddr);
632 list = &cp->cp_snapshot_list;
633 prev = le64_to_cpu(list->ssl_prev);
634 }
635 kunmap_atomic(kaddr, KM_USER0);
636
637 if (prev != 0) {
638 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
639 &prev_bh);
640 if (ret < 0)
641 goto out_curr;
642 } else {
643 prev_bh = header_bh;
644 get_bh(prev_bh);
645 }
646
647 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
648 list = nilfs_cpfile_block_get_snapshot_list(
649 cpfile, curr, curr_bh, kaddr);
650 list->ssl_prev = cpu_to_le64(cno);
651 kunmap_atomic(kaddr, KM_USER0);
652
653 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
654 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
655 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
656 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
657 nilfs_checkpoint_set_snapshot(cp);
658 kunmap_atomic(kaddr, KM_USER0);
659
660 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
661 list = nilfs_cpfile_block_get_snapshot_list(
662 cpfile, prev, prev_bh, kaddr);
663 list->ssl_next = cpu_to_le64(cno);
664 kunmap_atomic(kaddr, KM_USER0);
665
666 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
667 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
668 le64_add_cpu(&header->ch_nsnapshots, 1);
669 kunmap_atomic(kaddr, KM_USER0);
670
671 nilfs_mdt_mark_buffer_dirty(prev_bh);
672 nilfs_mdt_mark_buffer_dirty(curr_bh);
673 nilfs_mdt_mark_buffer_dirty(cp_bh);
674 nilfs_mdt_mark_buffer_dirty(header_bh);
675 nilfs_mdt_mark_dirty(cpfile);
676
677 brelse(prev_bh);
678
679 out_curr:
680 brelse(curr_bh);
681
682 out_header:
683 brelse(header_bh);
684
685 out_cp:
686 brelse(cp_bh);
687
688 out_sem:
689 up_write(&NILFS_MDT(cpfile)->mi_sem);
690 return ret;
691}
692
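/*
 * Illustrative sketch, not part of the patch: nilfs_cpfile_set_snapshot()
 * above maintains a doubly linked list threaded through the on-disk
 * checkpoint entries, anchored in the cpfile header (checkpoint number 0)
 * and kept sorted by checkpoint number.  A simplified in-memory analogue
 * of the sorted insertion, with hypothetical types:
 */
struct snap {
	__u64 cno;
	struct snap *next, *prev;	/* head sentinel uses cno == 0 */
};

static void snap_insert_sorted(struct snap *head, struct snap *new)
{
	struct snap *pos = head->prev;	/* start from the largest cno */

	while (pos != head && pos->cno > new->cno)
		pos = pos->prev;	/* walk toward smaller numbers */
	new->next = pos->next;		/* link between pos and pos->next */
	new->prev = pos;
	pos->next->prev = new;
	pos->next = new;
}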
693static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
694{
695 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
696 struct nilfs_cpfile_header *header;
697 struct nilfs_checkpoint *cp;
698 struct nilfs_snapshot_list *list;
699 __u64 next, prev;
700 void *kaddr;
701 int ret;
702
703 if (cno == 0)
704 return -ENOENT; /* checkpoint number 0 is invalid */
705 down_write(&NILFS_MDT(cpfile)->mi_sem);
706
707 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
708 if (ret < 0)
709 goto out_sem;
710 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
711 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
712 if (nilfs_checkpoint_invalid(cp)) {
713 ret = -ENOENT;
714 kunmap_atomic(kaddr, KM_USER0);
715 goto out_cp;
716 }
717 if (!nilfs_checkpoint_snapshot(cp)) {
718 ret = 0;
719 kunmap_atomic(kaddr, KM_USER0);
720 goto out_cp;
721 }
722
723 list = &cp->cp_snapshot_list;
724 next = le64_to_cpu(list->ssl_next);
725 prev = le64_to_cpu(list->ssl_prev);
726 kunmap_atomic(kaddr, KM_USER0);
727
728 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
729 if (ret < 0)
730 goto out_cp;
731 if (next != 0) {
732 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
733 &next_bh);
734 if (ret < 0)
735 goto out_header;
736 } else {
737 next_bh = header_bh;
738 get_bh(next_bh);
739 }
740 if (prev != 0) {
741 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
742 &prev_bh);
743 if (ret < 0)
744 goto out_next;
745 } else {
746 prev_bh = header_bh;
747 get_bh(prev_bh);
748 }
749
750 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
751 list = nilfs_cpfile_block_get_snapshot_list(
752 cpfile, next, next_bh, kaddr);
753 list->ssl_prev = cpu_to_le64(prev);
754 kunmap_atomic(kaddr, KM_USER0);
755
756 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
757 list = nilfs_cpfile_block_get_snapshot_list(
758 cpfile, prev, prev_bh, kaddr);
759 list->ssl_next = cpu_to_le64(next);
760 kunmap_atomic(kaddr, KM_USER0);
761
762 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
763 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
764 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
765 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
766 nilfs_checkpoint_clear_snapshot(cp);
767 kunmap_atomic(kaddr, KM_USER0);
768
769 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
770 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
771 le64_add_cpu(&header->ch_nsnapshots, -1);
772 kunmap_atomic(kaddr, KM_USER0);
773
774 nilfs_mdt_mark_buffer_dirty(next_bh);
775 nilfs_mdt_mark_buffer_dirty(prev_bh);
776 nilfs_mdt_mark_buffer_dirty(cp_bh);
777 nilfs_mdt_mark_buffer_dirty(header_bh);
778 nilfs_mdt_mark_dirty(cpfile);
779
780 brelse(prev_bh);
781
782 out_next:
783 brelse(next_bh);
784
785 out_header:
786 brelse(header_bh);
787
788 out_cp:
789 brelse(cp_bh);
790
791 out_sem:
792 up_write(&NILFS_MDT(cpfile)->mi_sem);
793 return ret;
794}
795
796/**
797 * nilfs_cpfile_is_snapshot - determine whether a checkpoint is a snapshot
798 * @cpfile: inode of checkpoint file
799 * @cno: checkpoint number
800 *
801 * Description: nilfs_cpfile_is_snapshot() tests if checkpoint @cno is a snapshot.
802 *
803 * Return Value: On success, 1 is returned if the checkpoint specified by
804 * @cno is a snapshot, or 0 if not. On error, one of the following negative
805 * error codes is returned.
806 *
807 * %-EIO - I/O error.
808 *
809 * %-ENOMEM - Insufficient amount of memory available.
810 *
811 * %-ENOENT - No such checkpoint.
812 */
813int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
814{
815 struct buffer_head *bh;
816 struct nilfs_checkpoint *cp;
817 void *kaddr;
818 int ret;
819
820 if (cno == 0)
821 return -ENOENT; /* checkpoint number 0 is invalid */
822 down_read(&NILFS_MDT(cpfile)->mi_sem);
823
824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
825 if (ret < 0)
826 goto out;
827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
829 ret = nilfs_checkpoint_snapshot(cp);
830 kunmap_atomic(kaddr, KM_USER0);
831 brelse(bh);
832
833 out:
834 up_read(&NILFS_MDT(cpfile)->mi_sem);
835 return ret;
836}
837
838/**
839 * nilfs_cpfile_change_cpmode - change checkpoint mode
840 * @cpfile: inode of checkpoint file
841 * @cno: checkpoint number
842 * @mode: mode of checkpoint
843 *
844 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
845 * checkpoint specified by @cno. @mode must be NILFS_CHECKPOINT or NILFS_SNAPSHOT.
846 *
847 * Return Value: On success, 0 is returned. On error, one of the following
848 * negative error codes is returned.
849 *
850 * %-EIO - I/O error.
851 *
852 * %-ENOMEM - Insufficient amount of memory available.
853 *
854 * %-ENOENT - No such checkpoint.
855 */
856int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
857{
858 struct the_nilfs *nilfs;
859 int ret;
860
861 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
862
863 switch (mode) {
864 case NILFS_CHECKPOINT:
865 /*
866 * Check for protecting existing snapshot mounts:
867 * bd_mount_sem is used to make this operation atomic and
868 * exclusive with a new mount job. Though it doesn't cover
869 * umount, it's enough for the purpose.
870 */
871 down(&nilfs->ns_bdev->bd_mount_sem);
872 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
873 /* Current implementation does not have to protect
874 plain read-only mounts since they are exclusive
875 with a read/write mount and are protected from the
876 cleaner. */
877 ret = -EBUSY;
878 } else
879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
880 up(&nilfs->ns_bdev->bd_mount_sem);
881 return ret;
882 case NILFS_SNAPSHOT:
883 return nilfs_cpfile_set_snapshot(cpfile, cno);
884 default:
885 return -EINVAL;
886 }
887}
888
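/*
 * Illustrative sketch, not part of the patch: turning checkpoint @cno into
 * a snapshot and back through the mode interface above; "cpfile" is assumed
 * to be a valid cpfile inode and the function name is a placeholder.
 */
static int example_toggle_snapshot(struct inode *cpfile, __u64 cno)
{
	int err;

	err = nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_SNAPSHOT);
	if (err)	/* e.g. -ENOENT if @cno does not exist */
		return err;
	/* may fail with -EBUSY while the snapshot is mounted */
	return nilfs_cpfile_change_cpmode(cpfile, cno, NILFS_CHECKPOINT);
}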
889/**
890 * nilfs_cpfile_get_stat - get checkpoint statistics
891 * @cpfile: inode of checkpoint file
892 * @cpstat: pointer to a structure of checkpoint statistics
893 *
894 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
895 *
896 * Return Value: On success, 0 is returned, and checkpoint information is
897 * stored in the place pointed to by @cpstat. On error, one of the following
898 * negative error codes is returned.
899 *
900 * %-EIO - I/O error.
901 *
902 * %-ENOMEM - Insufficient amount of memory available.
903 */
904int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
905{
906 struct buffer_head *bh;
907 struct nilfs_cpfile_header *header;
908 void *kaddr;
909 int ret;
910
911 down_read(&NILFS_MDT(cpfile)->mi_sem);
912
913 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
914 if (ret < 0)
915 goto out_sem;
916 kaddr = kmap_atomic(bh->b_page, KM_USER0);
917 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
918 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
919 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
920 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
921 kunmap_atomic(kaddr, KM_USER0);
922 brelse(bh);
923
924 out_sem:
925 up_read(&NILFS_MDT(cpfile)->mi_sem);
926 return ret;
927}
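
/*
 * Illustrative sketch, not part of the patch: reading the checkpoint
 * statistics exposed above.  The field names follow the code in
 * nilfs_cpfile_get_stat(); the function name is a placeholder.
 */
static void example_print_cpstat(struct inode *cpfile)
{
	struct nilfs_cpstat cpstat;

	if (nilfs_cpfile_get_stat(cpfile, &cpstat) == 0)
		printk(KERN_DEBUG "cno=%llu ncps=%llu nsss=%llu\n",
		       (unsigned long long)cpstat.cs_cno,
		       (unsigned long long)cpstat.cs_ncps,
		       (unsigned long long)cpstat.cs_nsss);
}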
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
248
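/*
 * Illustrative sketch, not part of the patch: the DAT entry updates above
 * follow a two-phase prepare/commit (or abort) protocol.  A typical update
 * of the disk address behind a virtual block number could look like this;
 * the function name and the bail_out flag are placeholders.
 */
static int example_dat_update(struct inode *dat, __u64 vblocknr,
			      sector_t new_blocknr, int bail_out)
{
	struct nilfs_palloc_req req = { .pr_entry_nr = vblocknr };
	int err;

	err = nilfs_dat_prepare_start(dat, &req);	/* phase 1: may fail */
	if (err)
		return err;
	if (bail_out) {			/* roll back without side effects */
		nilfs_dat_abort_start(dat, &req);
		return -EAGAIN;
	}
	nilfs_dat_commit_start(dat, &req, new_blocknr);	/* phase 2: no fail */
	return 0;
}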
249/**
250 * nilfs_dat_mark_dirty - mark the DAT entry of a virtual block number dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() marks the DAT entry of @vblocknr dirty.
255 *
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - The virtual block number has not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
393
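/*
 * Illustrative sketch, not part of the patch: resolving a virtual block
 * number to its current disk address with the translator above; the
 * function name and the zero "unmapped" sentinel are placeholders.
 */
static sector_t example_resolve(struct inode *dat, __u64 vblocknr)
{
	sector_t pbn = 0;
	int err = nilfs_dat_translate(dat, vblocknr, &pbn);

	return err ? 0 : pbn;	/* 0 here signals "unmapped" to the caller */
}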
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410 /* first and last virtual block numbers in this entry block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193 "entry in directory #%lu spans the page boundary"
194 "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
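/*
 * Illustrative note, not part of the patch: nilfs_readdir() above encodes
 * the directory position handed to filldir as a page index combined with
 * the byte offset of the entry inside that page:
 *
 *	pos = ((loff_t)n << PAGE_CACHE_SHIFT) | ((char *)de - kaddr);
 *
 * so a single loff_t addresses both the page cache page and the entry.
 */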
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_dir). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367 if (npages == 0)
368 goto out;
369
370 /* OFFSET_CACHE */
371 *res_page = NULL;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402 "dir %lu size %lld exceeds block cout %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
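/*
 * Illustrative note, not part of the patch: when nilfs_add_link() above
 * finds a live entry with surplus space (rec_len >= name_len + reclen),
 * it splits that entry in place.  With L = NILFS_DIR_REC_LEN(de->name_len)
 * and R = le16_to_cpu(de->rec_len):
 *
 *	before:  [ de: rec_len = R, live name ..................... ]
 *	after:   [ de: rec_len = L, live name ][ de1: rec_len = R - L ]
 *
 * and the new name is then written into de1.
 */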
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set up the first fragment of the directory ("." and ".." entries).
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
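/*
 * Illustrative note, not part of the patch: the helpers below assume the
 * direct bmap keeps its block pointers inline in b_u.u_data, right behind
 * a single nilfs_direct_node header slot:
 *
 *	u_data:  [ nilfs_direct_node ][ dptr 0 ][ dptr 1 ] ... [ dptr N-1 ]
 *
 * so nilfs_direct_get_ptr()/nilfs_direct_set_ptr() index that array
 * directly, using the file block offset as the key.
 */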
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
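
The switch at the end of nilfs_direct_init() is the only place the two operation tables above are selected: the DAT inode addresses its blocks physically (nilfs_direct_ops_p), while every other inode uses virtual block addresses translated through the DAT (nilfs_direct_ops_v). A standalone sketch of that dispatch, with illustrative demo_* names; the value 3 for the DAT inode number is an assumption matching upstream NILFS_DAT_INO:

#include <stdio.h>

struct demo_ops { const char *name; };

static const struct demo_ops demo_ops_v = { "virtual (translated via DAT)" };
static const struct demo_ops demo_ops_p = { "physical (the DAT itself)" };

static const struct demo_ops *demo_select_ops(unsigned long ino,
					      unsigned long dat_ino)
{
	/* mirrors the switch on b_inode->i_ino above */
	return (ino == dat_ino) ? &demo_ops_p : &demo_ops_v;
}

int main(void)
{
	unsigned long dat_ino = 3;	/* assumption: NILFS_DAT_INO */

	printf("ino 3  -> %s\n", demo_select_ops(3, dat_ino)->name);
	printf("ino 12 -> %s\n", demo_select_ops(12, dat_ino)->name);
	return 0;
}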
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @dn_pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
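
A worked example of the capacity implied by the macros above, assuming NILFS_BMAP_SIZE is 128 bytes (its value in the upstream headers): 128 / sizeof(__le64) - 1 leaves 15 direct slots, so a direct mapping covers file block offsets 0 through 14 before the bmap has to be converted to a B-tree.

#include <stdio.h>

#define DEMO_BMAP_SIZE	128	/* assumption: upstream NILFS_BMAP_SIZE */

int main(void)
{
	/* one __le64 slot is reserved, hence the trailing "- 1" */
	unsigned long nblocks = DEMO_BMAP_SIZE / sizeof(unsigned long long) - 1;

	/* 128 / 8 - 1 = 15 direct slots, keys 0 .. 14 */
	printf("direct blocks: %lu, key range: 0 .. %lu\n",
	       nblocks, nblocks - 1);
	return 0;
}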
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34	 * This is the only entry point that can catch write and sync
35	 * timing for both data blocks and intermediate blocks.
36	 *
37	 * This function should be revised when the writeback function
38	 * is implemented.
39 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
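
For reference, a minimal userspace sketch of the two branches above: fdatasync() reaches this function with datasync != 0 and triggers a dsync segment covering the data, while fsync() arrives with datasync == 0 and forces a full segment construction. The file name is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	if (fdatasync(fd) < 0)	/* datasync != 0 path: dsync segment */
		perror("fdatasync");
	if (fsync(fd) < 0)	/* datasync == 0 path: full construction */
		perror("fsync");
	close(fd);
	return 0;
}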
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
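
A userspace sketch of what drives this handler: the first store into a shared writable mapping takes a write fault, which gives the filesystem the chance above to fill holes inside a transaction before the page is marked dirty. Path and sizes are illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096) < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* write fault -> ->page_mkwrite() */
	munmap(p, 4096);
	close(fd);
	return 0;
}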
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULLs here: the current defaults are OK for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
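
The three functions above form an init / commit / clear lifecycle around a shadow copy of the DAT. A standalone model of that pattern (all names illustrative): work happens on the copy, and only a successful commit writes the result back over the live state.

#include <stdio.h>

struct demo_state { unsigned long blocks; unsigned int flags; };

static void demo_init_shadow(struct demo_state *shadow,
			     const struct demo_state *live)
{
	*shadow = *live;	/* like nilfs_init_gcdat_inode() */
}

static void demo_commit_shadow(struct demo_state *live,
			       const struct demo_state *shadow)
{
	*live = *shadow;	/* like nilfs_commit_gcdat_inode() */
}

int main(void)
{
	struct demo_state live = { .blocks = 100, .flags = 0 };
	struct demo_state shadow;

	demo_init_shadow(&shadow, &live);
	shadow.blocks = 80;	/* GC relocated some blocks on the copy */
	demo_commit_shadow(&live, &shadow);
	printf("live blocks after commit: %lu\n", live.blocks);
	return 0;
}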
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file implements the cache of on-disk blocks to be moved in
27 * garbage collection. The disk blocks are held with dummy inodes
28 * (called gcinodes), and this file provides a lookup function for the
29 * dummy inodes and a read function for their buffers.
30 *
31 * Since NILFS2 retains multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to take a checkpoint number
36 * argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes are released each time
39 * they are copied to a new log. Dirty blocks made in the current
40 * generation and the blocks to be moved by GC never overlap, because
41 * the dirty blocks form a new generation; rather, they must be
42 * written individually.
43 */
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
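
A hypothetical caller sketch showing how the helpers above pair up: submit the read, then wait for it and mark the buffer dirty before the block is copied into a new log. Error handling is abbreviated; the real caller keeps buffers on a list and waits for them later.

static int demo_gc_move_data_block(struct inode *gc_inode, sector_t blkoff,
				   sector_t pbn, __u64 vbn)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_gccache_submit_read_data(gc_inode, blkoff, pbn, vbn, &bh);
	if (err < 0)
		return err;	/* -EIO, -ENOMEM or -ENOENT */

	err = nilfs_gccache_wait_and_mark_dirty(bh);
	brelse(bh);
	return err;		/* -EEXIST means it was already dirty */
}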
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
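
A standalone illustration of the key mixing in ihash(): the inode number is shifted left two bits before the checkpoint number is added, so distinct (ino, cno) pairs usually land in distinct buckets. hash_long() is modelled here with a 32-bit multiplicative hash; the kernel's implementation differs in detail, and the hash width is an assumption.

#include <stdint.h>
#include <stdio.h>

#define DEMO_HASH_BITS	8	/* assumption: stands in for NILFS_GCINODE_HASH_BITS */

static unsigned long demo_hash_long(uint32_t val, unsigned int bits)
{
	/* 32-bit golden-ratio multiplicative hash, keeping the top bits */
	return (uint32_t)(val * UINT32_C(0x9e370001)) >> (32 - bits);
}

static unsigned long demo_ihash(unsigned long ino, unsigned long long cno)
{
	return demo_hash_long((uint32_t)((ino << 2) + cno), DEMO_HASH_BITS);
}

int main(void)
{
	printf("bucket(ino=12, cno=5) = %lu\n", demo_ihash(12, 5));
	printf("bucket(ino=12, cno=6) = %lu\n", demo_ihash(12, 6));
	return 0;
}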
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head contains newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and a buffer_head
40 * pointer that contains the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56 req.pr_entry_nr = 0; /* 0 says find free inode from beginning of
57 a group. dull code!! */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
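
Both ifile helpers above follow the same prepare / dependent-step / commit-or-abort idiom from the persistent allocator. A minimal standalone model of that control flow (all names illustrative):

#include <stdio.h>

struct demo_req { int reserved; int committed; };

static int demo_prepare(struct demo_req *req)
{
	req->reserved = 1;	/* like nilfs_palloc_prepare_*_entry() */
	return 0;
}

static void demo_commit(struct demo_req *req) { req->committed = 1; }
static void demo_abort(struct demo_req *req) { req->reserved = 0; }

static int demo_two_phase(struct demo_req *req, int dependent_step_ok)
{
	int ret = demo_prepare(req);

	if (ret)
		return ret;
	if (!dependent_step_ok) {	/* e.g. get_entry_block() failed */
		demo_abort(req);
		return -1;
	}
	demo_commit(req);
	return 0;
}

int main(void)
{
	struct demo_req req = { 0, 0 };

	printf("result: %d (committed=%d)\n",
	       demo_two_phase(&req, 1), req.committed);
	return 0;
}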
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
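
A hypothetical caller sketch for the two inline helpers above: map the raw on-disk inode, read a field, and always unmap afterwards. The buffer head would come from nilfs_ifile_get_inode_block().

static inline __le16 demo_read_links_count(struct inode *ifile, ino_t ino,
					   struct buffer_head *ibh)
{
	struct nilfs_inode *raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
	__le16 links = raw_inode->i_links_count;

	nilfs_ifile_unmap_inode(ifile, ino, ibh);	/* kunmap the page */
	return links;
}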
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode - inode struct of the target file
39 * @blkoff - file block number
40 * @bh_result - buffer head to be mapped on
41 * @create - indicate whether allocating the block or not when it has not
42 * been allocated yet.
43 *
44 * This function does not issue an actual read request for the specified
45 * data block; that is done by the VFS.
46 * Bulk read for direct-io is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
103 to proper value */
104 } else if (ret == -ENOENT) {
105		/* not found is not an error (e.g. a hole); must return
106		   without setting the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
115
116/**
117 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
118 * address_space_operations.
119 * @file - file struct of the file to be read
120 * @page - the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
129 * address_space_operations.
130 * @file - file struct of the file to be read
131 * @mapping - address_space struct used for reading multiple pages
132 * @pages - the pages to be read
133 * @nr_pages - number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
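
A worked example of the nr_dirty computation above: one page dirties PAGE_SIZE / block-size blocks at once. Values below are illustrative.

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* 4 KiB pages */
	unsigned int blkbits = 10;	/* 1 KiB filesystem blocks */
	unsigned int nr_dirty = 1U << (page_shift - blkbits);

	printf("blocks dirtied per page: %u\n", nr_dirty);	/* 4 */
	return 0;
}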
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
189
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
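
For orientation, a sketch of the bracketing the two functions above establish across one buffered write; the VFS drives these calls, and the ordering is the point:

/*
 *	nilfs_write_begin()
 *	    nilfs_transaction_begin()       open the transaction
 *	    block_write_begin()             may abort the transaction on error
 *	<data copied into the page by the VFS>
 *	nilfs_write_end()
 *	    generic_write_end()
 *	    nilfs_set_file_dirty()          account newly dirtied blocks
 *	    nilfs_transaction_commit()      close the transaction
 */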
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323		goto failed_acl; /* never occurs. When nilfs_init_acl()
324				    is supported, proper cancellation of
325				    the above jobs should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351 /* XXX: check error code? Is there any thing I can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436 ii->i_acl = NILFS_ACL_NOT_CACHED;
437 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 if (nilfs_read_inode_common(inode, raw_inode))
440 goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518 /* When extending inode, nilfs->ns_inode_size should be checked
519 for substitutions of appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538 /* XXX: call with has_bmap = 0 is a workaround to avoid
539 deadlock of bmap. This delays update of i_bmap to just
540 before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
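
A worked example of the chunk size above: with 4 KiB blocks, NILFS_MAX_TRUNCATE_BLOCKS (16384) bounds each pass to 64 MiB, with nilfs_relax_pressure_in_lock() run between passes.

#include <stdio.h>

int main(void)
{
	unsigned long max_blocks = 16384;	/* NILFS_MAX_TRUNCATE_BLOCKS */
	unsigned long block_size = 4096;	/* 4 KiB blocks */

	printf("bytes truncated per pass: %lu MiB\n",
	       max_blocks * block_size >> 20);	/* 64 */
	return 0;
}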
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* May construct a logical segment and may fail in sync mode.
609 But truncate has no return value. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* May construct a logical segment and may fail in sync mode.
636 But delete_inode has no return value. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671 /* Caller of this function MUST lock s_inode_lock */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
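
The lock juggling above is a drop-lock / fetch / retake-lock / recheck pattern: the spinlock cannot be held across the sleeping ifile read, so another thread may install i_bh in the window, and the freshly fetched result is discarded in that case. A standalone pthreads model of the same pattern (compile with -pthread; all names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t demo_lock;
static int *demo_cached;

static int *demo_fetch_slow(void)	/* stands in for the ifile read */
{
	return malloc(sizeof(int));
}

static int *demo_load(void)
{
	int *p;

	pthread_spin_lock(&demo_lock);
	if (!demo_cached) {
		pthread_spin_unlock(&demo_lock);	/* can't sleep locked */
		p = demo_fetch_slow();
		if (!p)
			return NULL;
		pthread_spin_lock(&demo_lock);
		if (!demo_cached)
			demo_cached = p;	/* we won the race */
		else
			free(p);		/* someone beat us to it */
	}
	p = demo_cached;
	pthread_spin_unlock(&demo_lock);
	return p;
}

int main(void)
{
	pthread_spin_init(&demo_lock, PTHREAD_PROCESS_PRIVATE);
	printf("loaded: %p\n", (void *)demo_load());
	return 0;
}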
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730			return -EINVAL; /* NILFS_I_DIRTY may remain set on
731					   the inode being freed */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on the given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies data from a nilfs_inode to a corresponding inode
768 * entry in the inode block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..d6759b92006f
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,665 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h>
29#include <linux/nilfs2_fs.h>
30#include "nilfs.h"
31#include "segment.h"
32#include "bmap.h"
33#include "cpfile.h"
34#include "sufile.h"
35#include "dat.h"
36
37
38static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
39 struct nilfs_argv *argv, int dir,
40 ssize_t (*dofunc)(struct the_nilfs *,
41 __u64 *, int,
42 void *, size_t, size_t))
43{
44 void *buf;
45 void __user *base = (void __user *)(unsigned long)argv->v_base;
46 size_t maxmembs, total, n;
47 ssize_t nr;
48 int ret, i;
49 __u64 pos, ppos;
50
51 if (argv->v_nmembs == 0)
52 return 0;
53
54 if (argv->v_size > PAGE_SIZE)
55 return -EINVAL;
56
57 buf = (void *)__get_free_pages(GFP_NOFS, 0);
58 if (unlikely(!buf))
59 return -ENOMEM;
60 maxmembs = PAGE_SIZE / argv->v_size;
61
62 ret = 0;
63 total = 0;
64 pos = argv->v_index;
65 for (i = 0; i < argv->v_nmembs; i += n) {
66 n = (argv->v_nmembs - i < maxmembs) ?
67 argv->v_nmembs - i : maxmembs;
68 if ((dir & _IOC_WRITE) &&
69 copy_from_user(buf, base + argv->v_size * i,
70 argv->v_size * n)) {
71 ret = -EFAULT;
72 break;
73 }
74 ppos = pos;
75 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
76 n);
77 if (nr < 0) {
78 ret = nr;
79 break;
80 }
81 if ((dir & _IOC_READ) &&
82 copy_to_user(base + argv->v_size * i, buf,
83 argv->v_size * nr)) {
84 ret = -EFAULT;
85 break;
86 }
87 total += nr;
88 if ((size_t)nr < n)
89 break;
90 if (pos == ppos)
91 pos += n;
92 }
93 argv->v_nmembs = total;
94
95 free_pages((unsigned long)buf, 0);
96 return ret;
97}
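
A worked example of the chunking above: with a one-page bounce buffer, a 4096-byte page and 32-byte records give 128 records per copy_from_user()/copy_to_user() round trip. Record and request sizes below are illustrative.

#include <stdio.h>

int main(void)
{
	size_t page_size = 4096;
	size_t v_size = 32;	/* illustrative record size */
	size_t nmembs = 1000;	/* illustrative request size */
	size_t maxmembs = page_size / v_size;

	printf("records per chunk: %zu, chunks needed: %zu\n",
	       maxmembs, (nmembs + maxmembs - 1) / maxmembs);	/* 128, 8 */
	return 0;
}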
98
99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
100 unsigned int cmd, void __user *argp)
101{
102 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
103 struct nilfs_transaction_info ti;
104 struct nilfs_cpmode cpmode;
105 int ret;
106
107 if (!capable(CAP_SYS_ADMIN))
108 return -EPERM;
109 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
110 return -EFAULT;
111
112 nilfs_transaction_begin(inode->i_sb, &ti, 0);
113 ret = nilfs_cpfile_change_cpmode(
114 cpfile, cpmode.cm_cno, cpmode.cm_mode);
115 if (unlikely(ret < 0)) {
116 nilfs_transaction_abort(inode->i_sb);
117 return ret;
118 }
119 nilfs_transaction_commit(inode->i_sb); /* never fails */
120 return ret;
121}
122
123static int
124nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
125 unsigned int cmd, void __user *argp)
126{
127 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
128 struct nilfs_transaction_info ti;
129 __u64 cno;
130 int ret;
131
132 if (!capable(CAP_SYS_ADMIN))
133 return -EPERM;
134 if (copy_from_user(&cno, argp, sizeof(cno)))
135 return -EFAULT;
136
137 nilfs_transaction_begin(inode->i_sb, &ti, 0);
138 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
139 if (unlikely(ret < 0)) {
140 nilfs_transaction_abort(inode->i_sb);
141 return ret;
142 }
143 nilfs_transaction_commit(inode->i_sb); /* never fails */
144 return ret;
145}
146
147static ssize_t
148nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
149 void *buf, size_t size, size_t nmembs)
150{
151 int ret;
152
153 down_read(&nilfs->ns_segctor_sem);
154 ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
155 nmembs);
156 up_read(&nilfs->ns_segctor_sem);
157 return ret;
158}
159
160static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
161 unsigned int cmd, void __user *argp)
162{
163 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
164 struct nilfs_cpstat cpstat;
165 int ret;
166
167 down_read(&nilfs->ns_segctor_sem);
168 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
169 up_read(&nilfs->ns_segctor_sem);
170 if (ret < 0)
171 return ret;
172
173 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
174 ret = -EFAULT;
175 return ret;
176}
177
178static ssize_t
179nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180 void *buf, size_t size, size_t nmembs)
181{
182 int ret;
183
184 down_read(&nilfs->ns_segctor_sem);
185 ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
186 up_read(&nilfs->ns_segctor_sem);
187 return ret;
188}
189
190static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
191 unsigned int cmd, void __user *argp)
192{
193 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
194 struct nilfs_sustat sustat;
195 int ret;
196
197 down_read(&nilfs->ns_segctor_sem);
198 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
199 up_read(&nilfs->ns_segctor_sem);
200 if (ret < 0)
201 return ret;
202
203 if (copy_to_user(argp, &sustat, sizeof(sustat)))
204 ret = -EFAULT;
205 return ret;
206}
207
208static ssize_t
209nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
210 void *buf, size_t size, size_t nmembs)
211{
212 int ret;
213
214 down_read(&nilfs->ns_segctor_sem);
215 ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
216 up_read(&nilfs->ns_segctor_sem);
217 return ret;
218}
219
220static ssize_t
221nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
222 void *buf, size_t size, size_t nmembs)
223{
224 struct inode *dat = nilfs_dat_inode(nilfs);
225 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
226 struct nilfs_bdesc *bdescs = buf;
227 int ret, i;
228
229 down_read(&nilfs->ns_segctor_sem);
230 for (i = 0; i < nmembs; i++) {
231 ret = nilfs_bmap_lookup_at_level(bmap,
232 bdescs[i].bd_offset,
233 bdescs[i].bd_level + 1,
234 &bdescs[i].bd_blocknr);
235 if (ret < 0) {
236 if (ret != -ENOENT) {
237 up_read(&nilfs->ns_segctor_sem);
238 return ret;
239 }
240 bdescs[i].bd_blocknr = 0;
241 }
242 }
243 up_read(&nilfs->ns_segctor_sem);
244 return nmembs;
245}
246
247static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
248 unsigned int cmd, void __user *argp)
249{
250 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
251 struct nilfs_argv argv;
252 int ret;
253
254 if (copy_from_user(&argv, argp, sizeof(argv)))
255 return -EFAULT;
256
257 if (argv.v_size != sizeof(struct nilfs_bdesc))
258 return -EINVAL;
259
260 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
261 nilfs_ioctl_do_get_bdescs);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static int nilfs_ioctl_move_inode_block(struct inode *inode,
271 struct nilfs_vdesc *vdesc,
272 struct list_head *buffers)
273{
274 struct buffer_head *bh;
275 int ret;
276
277 if (vdesc->vd_flags == 0)
278 ret = nilfs_gccache_submit_read_data(
279 inode, vdesc->vd_offset, vdesc->vd_blocknr,
280 vdesc->vd_vblocknr, &bh);
281 else
282 ret = nilfs_gccache_submit_read_node(
283 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
284
285 if (unlikely(ret < 0)) {
286 if (ret == -ENOENT)
287 printk(KERN_CRIT
288 "%s: invalid virtual block address (%s): "
289 "ino=%llu, cno=%llu, offset=%llu, "
290 "blocknr=%llu, vblocknr=%llu\n",
291 __func__, vdesc->vd_flags ? "node" : "data",
292 (unsigned long long)vdesc->vd_ino,
293 (unsigned long long)vdesc->vd_cno,
294 (unsigned long long)vdesc->vd_offset,
295 (unsigned long long)vdesc->vd_blocknr,
296 (unsigned long long)vdesc->vd_vblocknr);
297 return ret;
298 }
299 bh->b_private = vdesc;
300 list_add_tail(&bh->b_assoc_buffers, buffers);
301 return 0;
302}
303
304static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
305 struct nilfs_argv *argv, void *buf)
306{
307 size_t nmembs = argv->v_nmembs;
308 struct inode *inode;
309 struct nilfs_vdesc *vdesc;
310 struct buffer_head *bh, *n;
311 LIST_HEAD(buffers);
312 ino_t ino;
313 __u64 cno;
314 int i, ret;
315
316 for (i = 0, vdesc = buf; i < nmembs; ) {
317 ino = vdesc->vd_ino;
318 cno = vdesc->vd_cno;
319 inode = nilfs_gc_iget(nilfs, ino, cno);
320 if (unlikely(inode == NULL)) {
321 ret = -ENOMEM;
322 goto failed;
323 }
324 do {
325 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
326 &buffers);
327 if (unlikely(ret < 0))
328 goto failed;
329 vdesc++;
330 } while (++i < nmembs &&
331 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
332 }
333
334 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
335 ret = nilfs_gccache_wait_and_mark_dirty(bh);
336 if (unlikely(ret < 0)) {
337 if (ret == -EEXIST) {
338 vdesc = bh->b_private;
339 printk(KERN_CRIT
340 "%s: conflicting %s buffer: "
341 "ino=%llu, cno=%llu, offset=%llu, "
342 "blocknr=%llu, vblocknr=%llu\n",
343 __func__,
344 vdesc->vd_flags ? "node" : "data",
345 (unsigned long long)vdesc->vd_ino,
346 (unsigned long long)vdesc->vd_cno,
347 (unsigned long long)vdesc->vd_offset,
348 (unsigned long long)vdesc->vd_blocknr,
349 (unsigned long long)vdesc->vd_vblocknr);
350 }
351 goto failed;
352 }
353 list_del_init(&bh->b_assoc_buffers);
354 bh->b_private = NULL;
355 brelse(bh);
356 }
357 return nmembs;
358
359 failed:
360 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
361 list_del_init(&bh->b_assoc_buffers);
362 bh->b_private = NULL;
363 brelse(bh);
364 }
365 return ret;
366}
367
368static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
369 struct nilfs_argv *argv, void *buf)
370{
371 size_t nmembs = argv->v_nmembs;
372 struct inode *cpfile = nilfs->ns_cpfile;
373 struct nilfs_period *periods = buf;
374 int ret, i;
375
376 for (i = 0; i < nmembs; i++) {
377 ret = nilfs_cpfile_delete_checkpoints(
378 cpfile, periods[i].p_start, periods[i].p_end);
379 if (ret < 0)
380 return ret;
381 }
382 return nmembs;
383}
384
385static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
386 struct nilfs_argv *argv, void *buf)
387{
388 size_t nmembs = argv->v_nmembs;
389 int ret;
390
391 ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
392
393 return (ret < 0) ? ret : nmembs;
394}
395
396static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
397 struct nilfs_argv *argv, void *buf)
398{
399 size_t nmembs = argv->v_nmembs;
400 struct inode *dat = nilfs_dat_inode(nilfs);
401 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
402 struct nilfs_bdesc *bdescs = buf;
403 int ret, i;
404
405 for (i = 0; i < nmembs; i++) {
406 /* XXX: use macro or inline func to check liveness */
407 ret = nilfs_bmap_lookup_at_level(bmap,
408 bdescs[i].bd_offset,
409 bdescs[i].bd_level + 1,
410 &bdescs[i].bd_blocknr);
411 if (ret < 0) {
412 if (ret != -ENOENT)
413 return ret;
414 bdescs[i].bd_blocknr = 0;
415 }
416 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
417 /* skip dead block */
418 continue;
419 if (bdescs[i].bd_level == 0) {
420 ret = nilfs_mdt_mark_block_dirty(dat,
421 bdescs[i].bd_offset);
422 if (ret < 0) {
423 WARN_ON(ret == -ENOENT);
424 return ret;
425 }
426 } else {
427 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
428 bdescs[i].bd_level);
429 if (ret < 0) {
430 WARN_ON(ret == -ENOENT);
431 return ret;
432 }
433 }
434 }
435 return nmembs;
436}
437
438static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
439 struct nilfs_argv *argv, void *buf)
440{
441 size_t nmembs = argv->v_nmembs;
442 struct nilfs_sb_info *sbi = nilfs->ns_writer;
443 int ret;
444
445 if (unlikely(!sbi)) {
446	/* never happens because this is called for a writable mount */
447 WARN_ON(1);
448 return -EROFS;
449 }
450 ret = nilfs_segctor_add_segments_to_be_freed(
451 NILFS_SC(sbi), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv, void **kbufs)
458{
459 const char *msg;
460 int ret;
461
462 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
463 if (ret < 0) {
464 msg = "cannot read source blocks";
465 goto failed;
466 }
467
468 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
469 if (ret < 0) {
470 /*
471 * can safely abort because checkpoints can be removed
472 * independently.
473 */
474 msg = "cannot delete checkpoints";
475 goto failed;
476 }
477 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], kbufs[2]);
478 if (ret < 0) {
479 /*
480 * can safely abort because DAT file is updated atomically
481 * using a copy-on-write technique.
482 */
483 msg = "cannot delete virtual blocks from DAT file";
484 goto failed;
485 }
486 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], kbufs[3]);
487 if (ret < 0) {
488 /*
489 * can safely abort because the operation is nondestructive.
490 */
491 msg = "cannot mark copying blocks dirty";
492 goto failed;
493 }
494 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
495 if (ret < 0) {
496 /*
497 * can safely abort because this operation is atomic.
498 */
499 msg = "cannot set segments to be freed";
500 goto failed;
501 }
502 return 0;
503
504 failed:
505 nilfs_remove_all_gcinode(nilfs);
506 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
507 msg, ret);
508 return ret;
509}
510
511static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
512 unsigned int cmd, void __user *argp)
513{
514 struct nilfs_argv argv[5];
515	static const size_t argsz[5] = {
516 sizeof(struct nilfs_vdesc),
517 sizeof(struct nilfs_period),
518 sizeof(__u64),
519 sizeof(struct nilfs_bdesc),
520 sizeof(__u64),
521 };
522 void __user *base;
523 void *kbufs[5];
524 struct the_nilfs *nilfs;
525 size_t len, nsegs;
526 int n, ret;
527
528 if (!capable(CAP_SYS_ADMIN))
529 return -EPERM;
530
531 if (copy_from_user(argv, argp, sizeof(argv)))
532 return -EFAULT;
533
534 nsegs = argv[4].v_nmembs;
535 if (argv[4].v_size != argsz[4])
536 return -EINVAL;
537 /*
538	 * argv[4] points to the segment numbers this ioctl cleans.  We
539	 * use kmalloc() for its buffer (via memdup_user()) because the
540	 * memory used for the segment numbers is small enough.
541 */
542 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
543 nsegs * sizeof(__u64));
544 if (IS_ERR(kbufs[4]))
545 return PTR_ERR(kbufs[4]);
546
547 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
548
549 for (n = 0; n < 4; n++) {
550 ret = -EINVAL;
551 if (argv[n].v_size != argsz[n])
552 goto out_free;
553
554 if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
555 goto out_free;
556
557 len = argv[n].v_size * argv[n].v_nmembs;
558 base = (void __user *)(unsigned long)argv[n].v_base;
559 if (len == 0) {
560 kbufs[n] = NULL;
561 continue;
562 }
563
564 kbufs[n] = vmalloc(len);
565 if (!kbufs[n]) {
566 ret = -ENOMEM;
567 goto out_free;
568 }
569 if (copy_from_user(kbufs[n], base, len)) {
570 ret = -EFAULT;
571 vfree(kbufs[n]);
572 goto out_free;
573 }
574 }
575
576 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
577
578 out_free:
579 while (--n >= 0)
580 vfree(kbufs[n]);
581 kfree(kbufs[4]);
582 return ret;
583}
584
585static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
586 unsigned int cmd, void __user *argp)
587{
588 __u64 cno;
589 int ret;
590
591 ret = nilfs_construct_segment(inode->i_sb);
592 if (ret < 0)
593 return ret;
594
595 if (argp != NULL) {
596 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
597 if (copy_to_user(argp, &cno, sizeof(cno)))
598 return -EFAULT;
599 }
600 return 0;
601}
602
603static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
604 unsigned int cmd, void __user *argp,
605 size_t membsz,
606 ssize_t (*dofunc)(struct the_nilfs *,
607 __u64 *, int,
608 void *, size_t, size_t))
609
610{
611 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
612 struct nilfs_argv argv;
613 int ret;
614
615 if (copy_from_user(&argv, argp, sizeof(argv)))
616 return -EFAULT;
617
618 if (argv.v_size != membsz)
619 return -EINVAL;
620
621 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
622 if (ret < 0)
623 return ret;
624
625 if (copy_to_user(argp, &argv, sizeof(argv)))
626 ret = -EFAULT;
627 return ret;
628}
629
630long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
631{
632 struct inode *inode = filp->f_dentry->d_inode;
633	void __user *argp = (void __user *)arg;
634
635 switch (cmd) {
636 case NILFS_IOCTL_CHANGE_CPMODE:
637 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
638 case NILFS_IOCTL_DELETE_CHECKPOINT:
639 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_CPINFO:
641 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
642 sizeof(struct nilfs_cpinfo),
643 nilfs_ioctl_do_get_cpinfo);
644 case NILFS_IOCTL_GET_CPSTAT:
645 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
646 case NILFS_IOCTL_GET_SUINFO:
647 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
648 sizeof(struct nilfs_suinfo),
649 nilfs_ioctl_do_get_suinfo);
650 case NILFS_IOCTL_GET_SUSTAT:
651 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
652 case NILFS_IOCTL_GET_VINFO:
653 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
654 sizeof(struct nilfs_vinfo),
655 nilfs_ioctl_do_get_vinfo);
656 case NILFS_IOCTL_GET_BDESCS:
657 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
658 case NILFS_IOCTL_CLEAN_SEGMENTS:
659 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
660 case NILFS_IOCTL_SYNC:
661 return nilfs_ioctl_sync(inode, filp, cmd, argp);
662 default:
663 return -ENOTTY;
664 }
665}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..bb78745a0e30
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,564 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49	/* Caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct super_block *sb = inode->i_sb;
81 struct nilfs_transaction_info ti;
82 struct buffer_head *bh;
83 int err;
84
85 if (!sb) {
86 /*
87 * Make sure this function is not called from any
88 * read-only context.
89 */
90 if (!nilfs->ns_writer) {
91 WARN_ON(1);
92 err = -EROFS;
93 goto out;
94 }
95 sb = nilfs->ns_writer->s_super;
96 }
97
98 nilfs_transaction_begin(sb, &ti, 0);
99
100 err = -ENOMEM;
101 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
102 if (unlikely(!bh))
103 goto failed_unlock;
104
105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh))
107 goto failed_bh;
108#if 0
109 /* The uptodate flag is not protected by the page lock, but
110	   the mapped flag is.  Thus, we don't have to wait for the buffer. */
111 wait_on_buffer(bh);
112 if (buffer_uptodate(bh))
113 goto failed_bh;
114#endif
115
116 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
118 if (likely(!err)) {
119 get_bh(bh);
120 *out_bh = bh;
121 }
122
123 failed_bh:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 brelse(bh);
127
128 failed_unlock:
129 if (likely(!err))
130 err = nilfs_transaction_commit(sb);
131 else
132 nilfs_transaction_abort(sb);
133 out:
134 return err;
135}
136
137static int
138nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh)
140{
141 struct buffer_head *bh;
142 unsigned long blknum = 0;
143 int ret = -ENOMEM;
144
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
146 if (unlikely(!bh))
147 goto failed;
148
149 ret = -EEXIST; /* internal code */
150 if (buffer_uptodate(bh))
151 goto out;
152
153 if (mode == READA) {
154 if (!trylock_buffer(bh)) {
155 ret = -EBUSY;
156 goto failed_bh;
157 }
158 } else /* mode == READ */
159 lock_buffer(bh);
160
161 if (buffer_uptodate(bh)) {
162 unlock_buffer(bh);
163 goto out;
164 }
165 if (!buffer_mapped(bh)) { /* unused buffer */
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
167 &blknum);
168 if (unlikely(ret)) {
169 unlock_buffer(bh);
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 }
176
177 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh);
179 submit_bh(mode, bh);
180 ret = 0;
181 out:
182 get_bh(bh);
183 *out_bh = bh;
184
185 failed_bh:
186 unlock_page(bh->b_page);
187 page_cache_release(bh->b_page);
188 brelse(bh);
189 failed:
190 return ret;
191}
192
193static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
194 struct buffer_head **out_bh)
195{
196 struct buffer_head *first_bh, *bh;
197 unsigned long blkoff;
198 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
199 int err;
200
201 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
202 if (err == -EEXIST) /* internal code */
203 goto out;
204
205 if (unlikely(err))
206 goto failed;
207
208 blkoff = block + 1;
209 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
210 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
211 if (likely(!err || err == -EEXIST))
212 brelse(bh);
213 else if (err != -EBUSY)
214 break; /* abort readahead if bmap lookup failed */
215
216 if (!buffer_locked(first_bh))
217 goto out_no_wait;
218 }
219
220 wait_on_buffer(first_bh);
221
222 out_no_wait:
223 err = -EIO;
224 if (!buffer_uptodate(first_bh))
225 goto failed_bh;
226 out:
227 *out_bh = first_bh;
228 return 0;
229
230 failed_bh:
231 brelse(first_bh);
232 failed:
233 return err;
234}
235
236/**
237 * nilfs_mdt_get_block - read or create a buffer on a meta data file.
238 * @inode: inode of the meta data file
239 * @blkoff: block offset
240 * @create: create flag
241 * @init_block: initializer used for newly allocated block
242 * @out_bh: output of a pointer to the buffer_head
243 *
244 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
245 * a new buffer if @create is not zero.  On success, the returned buffer
246 * is guaranteed to be either existing or newly formatted, under a buffer lock.
247 * @out_bh is substituted only when zero is returned.
248 *
249 * Return Value: On success, it returns 0.  On error, one of the following
250 * negative error codes is returned.
251 *
252 * %-ENOMEM - Insufficient memory available.
253 *
254 * %-EIO - I/O error
255 *
256 * %-ENOENT - the specified block does not exist (hole block)
257 *
258 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
259 *
260 * %-EROFS - Read only filesystem (for create mode)
261 */
262int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
263 void (*init_block)(struct inode *,
264 struct buffer_head *, void *),
265 struct buffer_head **out_bh)
266{
267 int ret;
268
269	/* Should be rewritten by merging in nilfs_mdt_read_block() */
270 retry:
271 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
272 if (!create || ret != -ENOENT)
273 return ret;
274
275 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
276 if (unlikely(ret == -EEXIST)) {
277 /* create = 0; */ /* limit read-create loop retries */
278 goto retry;
279 }
280 return ret;
281}
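
/*
 * Illustrative sketch, not part of the patch: a typical caller creates
 * or reads back a metadata block and drops its buffer reference when
 * done.  The initializer and the callers' names below are hypothetical.
 */
static void example_init_block(struct inode *inode, struct buffer_head *bh,
			       void *kaddr)
{
	/* format the freshly zeroed block; kaddr maps the block's data */
}

static int example_touch_block(struct inode *mdt_inode, unsigned long blkoff)
{
	struct buffer_head *bh;
	int err;

	err = nilfs_mdt_get_block(mdt_inode, blkoff, 1 /* create */,
				  example_init_block, &bh);
	if (err)
		return err;	/* e.g. -ENOMEM, -EIO, or -EROFS */

	nilfs_mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(mdt_inode);
	brelse(bh);		/* release the reference taken above */
	return 0;
}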
282
283/**
284 * nilfs_mdt_delete_block - make a hole in the meta data file.
285 * @inode: inode of the meta data file
286 * @block: block offset
287 *
288 * Return Value: On success, zero is returned.
289 * On error, one of the following negative error codes is returned.
290 *
291 * %-ENOMEM - Insufficient memory available.
292 *
293 * %-EIO - I/O error
294 *
295 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
296 */
297int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
298{
299 struct nilfs_inode_info *ii = NILFS_I(inode);
300 int err;
301
302 err = nilfs_bmap_delete(ii->i_bmap, block);
303 if (!err || err == -ENOENT) {
304 nilfs_mdt_mark_dirty(inode);
305 nilfs_mdt_forget_block(inode, block);
306 }
307 return err;
308}
309
310/**
311 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
312 * @inode: inode of the meta data file
313 * @block: block offset
314 *
315 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer and
316 * tries to release the page containing the buffer from the page cache.
317 *
318 * Return Value: On success, 0 is returned. On error, one of the following
319 * negative error codes is returned.
320 *
321 * %-EBUSY - page has an active buffer.
322 *
323 * %-ENOENT - page cache has no page addressed by the offset.
324 */
325int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
326{
327 pgoff_t index = (pgoff_t)block >>
328 (PAGE_CACHE_SHIFT - inode->i_blkbits);
329 struct page *page;
330 unsigned long first_block;
331 int ret = 0;
332 int still_dirty;
333
334 page = find_lock_page(inode->i_mapping, index);
335 if (!page)
336 return -ENOENT;
337
338 wait_on_page_writeback(page);
339
340 first_block = (unsigned long)index <<
341 (PAGE_CACHE_SHIFT - inode->i_blkbits);
342 if (page_has_buffers(page)) {
343 struct buffer_head *bh;
344
345 bh = nilfs_page_get_nth_block(page, block - first_block);
346 nilfs_forget_buffer(bh);
347 }
348 still_dirty = PageDirty(page);
349 unlock_page(page);
350 page_cache_release(page);
351
352 if (still_dirty ||
353 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
354 ret = -EBUSY;
355 return ret;
356}
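
/*
 * Illustrative sketch, not part of the patch: a caller that merely wants
 * to evict a block from the cache can treat both documented error codes
 * as benign, much as nilfs_mdt_delete_block() above ignores the return
 * value.  The function name is hypothetical.
 */
static void example_evict_block(struct inode *mdt_inode, unsigned long blkoff)
{
	int err = nilfs_mdt_forget_block(mdt_inode, blkoff);

	/*
	 * -ENOENT: nothing was cached at that offset.
	 * -EBUSY:  the page is still dirty or in use; retry later.
	 */
	WARN_ON(err && err != -ENOENT && err != -EBUSY);
}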
357
358/**
359 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
360 * @inode: inode of the meta data file
361 * @block: block offset
362 *
363 * Return Value: On success, it returns 0.  On error, one of the following
364 * negative error codes is returned.
365 *
366 * %-ENOMEM - Insufficient memory available.
367 *
368 * %-EIO - I/O error
369 *
370 * %-ENOENT - the specified block does not exist (hole block)
371 *
372 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
373 */
374int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
375{
376 struct buffer_head *bh;
377 int err;
378
379 err = nilfs_mdt_read_block(inode, block, &bh);
380 if (unlikely(err))
381 return err;
382 nilfs_mark_buffer_dirty(bh);
383 nilfs_mdt_mark_dirty(inode);
384 brelse(bh);
385 return 0;
386}
387
388int nilfs_mdt_fetch_dirty(struct inode *inode)
389{
390 struct nilfs_inode_info *ii = NILFS_I(inode);
391
392 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
393 set_bit(NILFS_I_DIRTY, &ii->i_state);
394 return 1;
395 }
396 return test_bit(NILFS_I_DIRTY, &ii->i_state);
397}
398
399static int
400nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
401{
402 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data);
404 struct super_block *sb = inode->i_sb;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0;
407
408 redirty_page_for_writepage(wbc, page);
409 unlock_page(page);
410
411 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
415 if (!writer)
416 return -EROFS;
417 sb = writer->s_super;
418 }
419
420 if (wbc->sync_mode == WB_SYNC_ALL)
421 err = nilfs_construct_segment(sb);
422 else if (wbc->for_reclaim)
423 nilfs_flush_segment(sb, inode->i_ino);
424
425 if (writer)
426 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
427 return err;
428}
429
430
431static struct address_space_operations def_mdt_aops = {
432 .writepage = nilfs_mdt_write_page,
433};
434
435static struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops;
437
438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
440 * ifile, or gcinodes. This allows the B-tree code and segment constructor
441 * to treat them like regular files, and this helps to simplify the
442 * implementation.
443 * On the other hand, some of these pseudo inodes are irregular:
444 * they don't have a valid inode->i_sb pointer because their lifetimes are
445 * longer than those of the super block structs; they may persist across
446 * several consecutive mounts/umounts.  This would need further discussion.
447 */
448struct inode *
449nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
450 ino_t ino, gfp_t gfp_mask)
451{
452 struct inode *inode = nilfs_alloc_inode(sb);
453
454 if (!inode)
455 return NULL;
456 else {
457 struct address_space * const mapping = &inode->i_data;
458 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
459
460 if (!mi) {
461 nilfs_destroy_inode(inode);
462 return NULL;
463 }
464 mi->mi_nilfs = nilfs;
465 init_rwsem(&mi->mi_sem);
466
467 inode->i_sb = sb; /* sb may be NULL for some meta data files */
468 inode->i_blkbits = nilfs->ns_blocksize_bits;
469 inode->i_flags = 0;
470 atomic_set(&inode->i_count, 1);
471 inode->i_nlink = 1;
472 inode->i_ino = ino;
473 inode->i_mode = S_IFREG;
474 inode->i_private = mi;
475
476#ifdef INIT_UNUSED_INODE_FIELDS
477 atomic_set(&inode->i_writecount, 0);
478 inode->i_size = 0;
479 inode->i_blocks = 0;
480 inode->i_bytes = 0;
481 inode->i_generation = 0;
482#ifdef CONFIG_QUOTA
483 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
484#endif
485 inode->i_pipe = NULL;
486 inode->i_bdev = NULL;
487 inode->i_cdev = NULL;
488 inode->i_rdev = 0;
489#ifdef CONFIG_SECURITY
490 inode->i_security = NULL;
491#endif
492 inode->dirtied_when = 0;
493
494 INIT_LIST_HEAD(&inode->i_list);
495 INIT_LIST_HEAD(&inode->i_sb_list);
496 inode->i_state = 0;
497#endif
498
499 spin_lock_init(&inode->i_lock);
500 mutex_init(&inode->i_mutex);
501 init_rwsem(&inode->i_alloc_sem);
502
503 mapping->host = NULL; /* instead of inode */
504 mapping->flags = 0;
505 mapping_set_gfp_mask(mapping, gfp_mask);
506 mapping->assoc_mapping = NULL;
507 mapping->backing_dev_info = nilfs->ns_bdi;
508
509 inode->i_mapping = mapping;
510 }
511
512 return inode;
513}
514
515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
516 ino_t ino, gfp_t gfp_mask)
517{
518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
519
520 if (!inode)
521 return NULL;
522
523 inode->i_op = &def_mdt_iops;
524 inode->i_fop = &def_mdt_fops;
525 inode->i_mapping->a_ops = &def_mdt_aops;
526 return inode;
527}
528
529void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
530 unsigned header_size)
531{
532 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
533
534 mi->mi_entry_size = entry_size;
535 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
536 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
537}
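
/*
 * Illustrative sketch, not part of the patch: with a 4096-byte block,
 * 64-byte entries, and a 24-byte header, the function above computes
 * mi_entries_per_block = 4096 / 64 = 64 and
 * mi_first_entry_offset = DIV_ROUND_UP(24, 64) = 1, so the first entry
 * slot is reserved for the header.  The sizes here are made up, not
 * those of any real NILFS metadata file.
 */
static void example_size_mdt_file(struct inode *mdt_inode)
{
	nilfs_mdt_set_entry_size(mdt_inode, 64 /* entry */, 24 /* header */);
}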
538
539void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
540{
541 shadow->i_mapping->assoc_mapping = orig->i_mapping;
542 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
543 &NILFS_I(orig)->i_btnode_cache;
544}
545
546void nilfs_mdt_clear(struct inode *inode)
547{
548 struct nilfs_inode_info *ii = NILFS_I(inode);
549
550 invalidate_mapping_pages(inode->i_mapping, 0, -1);
551 truncate_inode_pages(inode->i_mapping, 0);
552
553 nilfs_bmap_clear(ii->i_bmap);
554 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
555}
556
557void nilfs_mdt_destroy(struct inode *inode)
558{
559 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
560
561 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
562 kfree(mdi);
563 nilfs_destroy_inode(inode);
564}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..da6fc0bba2e5
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,314 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * nilfs inode data in memory
39 */
40struct nilfs_inode_info {
41 __u32 i_flags;
42 unsigned long i_state; /* Dynamic state flags */
43 struct nilfs_bmap *i_bmap;
44 union nilfs_bmap_union i_bmap_union;
45 __u64 i_xattr; /* sector_t ??? */
46 __u32 i_dir_start_lookup;
47 __u64 i_cno; /* check point number for GC inode */
48 struct address_space i_btnode_cache;
49 struct list_head i_dirty; /* List for connecting dirty files */
50
51#ifdef CONFIG_NILFS_XATTR
52 /*
53 * Extended attributes can be read independently of the main file
54 * data. Taking i_sem even when reading would cause contention
55 * between readers of EAs and writers of regular file data, so
56 * instead we synchronize on xattr_sem when reading or changing
57 * EAs.
58 */
59 struct rw_semaphore xattr_sem;
60#endif
61#ifdef CONFIG_NILFS_POSIX_ACL
62 struct posix_acl *i_acl;
63 struct posix_acl *i_default_acl;
64#endif
65 struct buffer_head *i_bh; /* i_bh contains a new or dirty
66 disk inode */
67 struct inode vfs_inode;
68};
69
70static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
71{
72 return container_of(inode, struct nilfs_inode_info, vfs_inode);
73}
74
75static inline struct nilfs_inode_info *
76NILFS_BMAP_I(const struct nilfs_bmap *bmap)
77{
78 return container_of((union nilfs_bmap_union *)bmap,
79 struct nilfs_inode_info,
80 i_bmap_union);
81}
82
83static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
84{
85 struct nilfs_inode_info *ii =
86 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
87 return &ii->vfs_inode;
88}
89
90static inline struct inode *NILFS_AS_I(struct address_space *mapping)
91{
92 return (mapping->host) ? :
93 container_of(mapping, struct inode, i_data);
94}
95
96/*
97 * Dynamic state flags of NILFS on-memory inode (i_state)
98 */
99enum {
100 NILFS_I_NEW = 0, /* Inode is newly created */
101 NILFS_I_DIRTY, /* The file is dirty */
102 NILFS_I_QUEUED, /* inode is in dirty_files list */
103 NILFS_I_BUSY, /* inode is grabbed by a segment
104 constructor */
105 NILFS_I_COLLECTED, /* All dirty blocks are collected */
106 NILFS_I_UPDATED, /* The file has been written back */
107 NILFS_I_INODE_DIRTY, /* write_inode is requested */
108 NILFS_I_BMAP, /* has bmap and btnode_cache */
109 NILFS_I_GCINODE, /* inode for GC, on memory only */
110 NILFS_I_GCDAT, /* shadow DAT, on memory only */
111};
112
113/*
114 * Macros to check inode numbers
115 */
116#define NILFS_MDT_INO_BITS \
117 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
118 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
119 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
120
121#define NILFS_SYS_INO_BITS \
122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
123
124#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
125
126#define NILFS_MDT_INODE(sb, ino) \
127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
128#define NILFS_VALID_INODE(sb, ino) \
129 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
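
/*
 * Illustrative sketch, not part of the patch: reserved metadata inode
 * numbers such as NILFS_DAT_INO satisfy NILFS_MDT_INODE(), while an
 * ordinary inode (ino >= NILFS_FIRST_INO(sb)) only satisfies
 * NILFS_VALID_INODE().  The helper name is hypothetical.
 */
static inline int example_is_mdt_ino(struct super_block *sb, ino_t ino)
{
	return NILFS_MDT_INODE(sb, ino);
}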
130
131/**
132 * struct nilfs_transaction_info: context information for synchronization
133 * @ti_magic: Magic number
134 * @ti_save: Backup of journal_info field of task_struct
135 * @ti_flags: Flags
136 * @ti_count: Nest level
137 * @ti_garbage: List of inodes to be put when releasing the semaphore
138 */
139struct nilfs_transaction_info {
140 u32 ti_magic;
141 void *ti_save;
142	/* This should never be used.  If it is,
143	   one of the other filesystems has a bug. */
144 unsigned short ti_flags;
145 unsigned short ti_count;
146 struct list_head ti_garbage;
147};
148
149/* ti_magic */
150#define NILFS_TI_MAGIC 0xd9e392fb
151
152/* ti_flags */
153#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
154#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the
155 end of transaction. */
156#define NILFS_TI_GC 0x0004 /* GC context */
157#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
158#define NILFS_TI_WRITER 0x0010 /* Constructor context */
159
160
161int nilfs_transaction_begin(struct super_block *,
162 struct nilfs_transaction_info *, int);
163int nilfs_transaction_commit(struct super_block *);
164void nilfs_transaction_abort(struct super_block *);
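
/*
 * Illustrative sketch, not part of the patch: the canonical calling
 * pattern for the transaction API, as used throughout the filesystem.
 * do_example_update() is a hypothetical modification step; judging by
 * the call sites, the third argument of nilfs_transaction_begin()
 * requests a free-space (vacancy) check for space-consuming operations.
 */
static int do_example_update(struct super_block *sb);	/* hypothetical */

static inline int example_transactional_update(struct super_block *sb)
{
	struct nilfs_transaction_info ti;
	int err;

	err = nilfs_transaction_begin(sb, &ti, 1);
	if (err)
		return err;

	err = do_example_update(sb);
	if (!err)
		err = nilfs_transaction_commit(sb);
	else
		nilfs_transaction_abort(sb);
	return err;
}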
165
166static inline void nilfs_set_transaction_flag(unsigned int flag)
167{
168 struct nilfs_transaction_info *ti = current->journal_info;
169
170 ti->ti_flags |= flag;
171}
172
173static inline int nilfs_test_transaction_flag(unsigned int flag)
174{
175 struct nilfs_transaction_info *ti = current->journal_info;
176
177 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
178 return 0;
179 return !!(ti->ti_flags & flag);
180}
181
182static inline int nilfs_doing_gc(void)
183{
184 return nilfs_test_transaction_flag(NILFS_TI_GC);
185}
186
187static inline int nilfs_doing_construction(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
190}
191
192static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
193{
194 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
195}
196
197/*
198 * function prototype
199 */
200#ifdef CONFIG_NILFS_POSIX_ACL
201#error "NILFS: not yet supported POSIX ACL"
202extern int nilfs_permission(struct inode *, int, struct nameidata *);
203extern int nilfs_acl_chmod(struct inode *);
204extern int nilfs_init_acl(struct inode *, struct inode *);
205#else
206#define nilfs_permission NULL
207
208static inline int nilfs_acl_chmod(struct inode *inode)
209{
210 return 0;
211}
212
213static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
214{
215 inode->i_mode &= ~current_umask();
216 return 0;
217}
218#endif
219
220#define NILFS_ATIME_DISABLE
221
222/* dir.c */
223extern int nilfs_add_link(struct dentry *, struct inode *);
224extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
225extern int nilfs_make_empty(struct inode *, struct inode *);
226extern struct nilfs_dir_entry *
227nilfs_find_entry(struct inode *, struct dentry *, struct page **);
228extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
229extern int nilfs_empty_dir(struct inode *);
230extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
231extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
232 struct page *, struct inode *);
233
234/* file.c */
235extern int nilfs_sync_file(struct file *, struct dentry *, int);
236
237/* ioctl.c */
238long nilfs_ioctl(struct file *, unsigned int, unsigned long);
239int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
240 void **);
241
242/* inode.c */
243extern struct inode *nilfs_new_inode(struct inode *, int);
244extern void nilfs_free_inode(struct inode *);
245extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
246extern void nilfs_set_inode_flags(struct inode *);
247extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
248extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
249extern struct inode *nilfs_iget(struct super_block *, unsigned long);
250extern void nilfs_update_inode(struct inode *, struct buffer_head *);
251extern void nilfs_truncate(struct inode *);
252extern void nilfs_delete_inode(struct inode *);
253extern int nilfs_setattr(struct dentry *, struct iattr *);
254extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
255 struct buffer_head **);
256extern int nilfs_inode_dirty(struct inode *);
257extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
258 unsigned);
259extern int nilfs_mark_inode_dirty(struct inode *);
260extern void nilfs_dirty_inode(struct inode *);
261
262/* namei.c */
263extern struct dentry *nilfs_get_parent(struct dentry *);
264
265/* super.c */
266extern struct inode *nilfs_alloc_inode(struct super_block *);
267extern void nilfs_destroy_inode(struct inode *);
268extern void nilfs_error(struct super_block *, const char *, const char *, ...)
269 __attribute__ ((format (printf, 3, 4)));
270extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
271 __attribute__ ((format (printf, 3, 4)));
272extern struct nilfs_super_block *
273nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
274extern int nilfs_store_magic_and_option(struct super_block *,
275 struct nilfs_super_block *, char *);
276extern int nilfs_commit_super(struct nilfs_sb_info *, int);
277extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
278extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
279
280/* gcinode.c */
281int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
282 struct buffer_head **);
283int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
284 struct buffer_head **);
285int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
286int nilfs_init_gccache(struct the_nilfs *);
287void nilfs_destroy_gccache(struct the_nilfs *);
288void nilfs_clear_gcinode(struct inode *);
289struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
290void nilfs_remove_all_gcinode(struct the_nilfs *);
291
292/* gcdat.c */
293int nilfs_init_gcdat_inode(struct the_nilfs *);
294void nilfs_commit_gcdat_inode(struct the_nilfs *);
295void nilfs_clear_gcdat_inode(struct the_nilfs *);
296
297/*
298 * Inodes and files operations
299 */
300extern struct file_operations nilfs_dir_operations;
301extern struct inode_operations nilfs_file_inode_operations;
302extern struct file_operations nilfs_file_operations;
303extern struct address_space_operations nilfs_aops;
304extern struct inode_operations nilfs_dir_inode_operations;
305extern struct inode_operations nilfs_special_inode_operations;
306extern struct inode_operations nilfs_symlink_inode_operations;
307
308/*
309 * filesystem type
310 */
311extern struct file_system_type nilfs_fs_type;
312
313
314#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..a2692bbc7b50
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,541 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since the page cache of B-tree node pages and the data page caches of
62 * pseudo inodes do not have a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, an old-style mark_buffer_dirty(), which does not
 * dirty the inode, is open-coded here instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
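/*
 * Editorial sketch (not part of the original patch): a typical caller
 * pattern for a btnode buffer, whose page belongs to a mapping with a
 * NULL ->host, so a plain mark_buffer_dirty() would oops here:
 *
 *	memcpy(bh->b_data, data, bh->b_size);
 *	nilfs_mark_buffer_dirty(bh);
 *	brelse(bh);
 */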
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95 * A shadow page cache uses assoc_mapping to point to its original
96 * page cache. The following code falls back to the original cache
97 * when the given cache is a shadow and the lookup did not hit.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
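/*
 * Editorial sketch (not part of the original patch): the expected
 * calling convention.  The returned buffer carries a reference and its
 * page is left locked, so a caller does roughly:
 *
 *	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
 *	if (unlikely(!bh))
 *		return -ENOMEM;
 *	...read or fill bh->b_data...
 *	unlock_page(bh->b_page);
 *	page_cache_release(bh->b_page);
 *	brelse(bh);
 */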
120/**
121 * nilfs_forget_buffer - discard dirty state
123 * @bh: buffer head of the buffer to be discarded
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 clear_buffer_dirty(bh);
132 if (nilfs_page_buffers_clean(page))
133 __nilfs_clear_page_dirty(page);
134
135 clear_buffer_uptodate(bh);
136 clear_buffer_mapped(bh);
137 bh->b_blocknr = -1;
138 ClearPageUptodate(page);
139 ClearPageMappedToDisk(page);
140 unlock_buffer(bh);
141 brelse(bh);
142}
143
144/**
145 * nilfs_copy_buffer -- copy buffer data and flags
146 * @dbh: destination buffer
147 * @sbh: source buffer
148 */
149void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
150{
151 void *kaddr0, *kaddr1;
152 unsigned long bits;
153 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
154 struct buffer_head *bh;
155
156 kaddr0 = kmap_atomic(spage, KM_USER0);
157 kaddr1 = kmap_atomic(dpage, KM_USER1);
158 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
159 kunmap_atomic(kaddr1, KM_USER1);
160 kunmap_atomic(kaddr0, KM_USER0);
161
162 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
163 dbh->b_blocknr = sbh->b_blocknr;
164 dbh->b_bdev = sbh->b_bdev;
165
166 bh = dbh;
167 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
168 while ((bh = bh->b_this_page) != dbh) {
169 lock_buffer(bh);
170 bits &= bh->b_state;
171 unlock_buffer(bh);
172 }
173 if (bits & (1UL << BH_Uptodate))
174 SetPageUptodate(dpage);
175 else
176 ClearPageUptodate(dpage);
177 if (bits & (1UL << BH_Mapped))
178 SetPageMappedToDisk(dpage);
179 else
180 ClearPageMappedToDisk(dpage);
181}
182
183/**
184 * nilfs_page_buffers_clean - check whether a page has dirty buffers
185 * @page: page to be checked
186 *
187 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
188 * Otherwise, it returns a non-zero value.
189 */
190int nilfs_page_buffers_clean(struct page *page)
191{
192 struct buffer_head *bh, *head;
193
194 bh = head = page_buffers(page);
195 do {
196 if (buffer_dirty(bh))
197 return 0;
198 bh = bh->b_this_page;
199 } while (bh != head);
200 return 1;
201}
202
203void nilfs_page_bug(struct page *page)
204{
205 struct address_space *m;
206 unsigned long ino = 0;
207
208 if (unlikely(!page)) {
209 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
210 return;
211 }
212
213 m = page->mapping;
214 if (m) {
215 struct inode *inode = NILFS_AS_I(m);
216 if (inode != NULL)
217 ino = inode->i_ino;
218 }
219 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
220 "mapping=%p ino=%lu\n",
221 page, atomic_read(&page->_count),
222 (unsigned long long)page->index, page->flags, m, ino);
223
224 if (page_has_buffers(page)) {
225 struct buffer_head *bh, *head;
226 int i = 0;
227
228 bh = head = page_buffers(page);
229 do {
230 printk(KERN_CRIT
231 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
232 i++, bh, atomic_read(&bh->b_count),
233 (unsigned long long)bh->b_blocknr, bh->b_state);
234 bh = bh->b_this_page;
235 } while (bh != head);
236 }
237}
238
239/**
240 * nilfs_alloc_private_page - allocate a private page with buffer heads
241 * @bdev: block device to which the buffer heads will be tied
 * @size: size of each buffer in bytes
 * @state: initial state bits for each buffer head
 *
242 * Return Value: On success, a pointer to the allocated page is returned.
243 * On error, NULL is returned.
244 */
245struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
246 unsigned long state)
247{
248 struct buffer_head *bh, *head, *tail;
249 struct page *page;
250
251 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
252 if (unlikely(!page))
253 return NULL;
254
255 lock_page(page);
256 head = alloc_page_buffers(page, size, 0);
257 if (unlikely(!head)) {
258 unlock_page(page);
259 __free_page(page);
260 return NULL;
261 }
262
263 bh = head;
264 do {
265 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
266 tail = bh;
267 bh->b_bdev = bdev;
268 bh = bh->b_this_page;
269 } while (bh);
270
271 tail->b_this_page = head;
272 attach_page_buffers(page, head);
273
274 return page;
275}
276
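/*
 * Editorial sketch (not part of the original patch): the page comes
 * back locked with a page count of 1 and is released with the helper
 * below once it is no longer needed; bdev and blocksize stand for the
 * caller's block device and block size:
 *
 *	page = nilfs_alloc_private_page(bdev, blocksize, 0);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	...use the buffers on the page...
 *	nilfs_free_private_page(page);
 */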
277void nilfs_free_private_page(struct page *page)
278{
279 BUG_ON(!PageLocked(page));
280 BUG_ON(page->mapping);
281
282 if (page_has_buffers(page) && !try_to_free_buffers(page))
283 NILFS_PAGE_BUG(page, "failed to free page");
284
285 unlock_page(page);
286 __free_page(page);
287}
288
289/**
290 * nilfs_copy_page -- copy the page with buffers
291 * @dst: destination page
292 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 *
295 * This function is for both data pages and btnode pages. The dirty flag
296 * must be handled by the caller. The page must not be under I/O.
297 * Both the source and destination pages must be locked.
298 */
299static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
300{
301 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
302 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
303
304 BUG_ON(PageWriteback(dst));
305
306 sbh = sbufs = page_buffers(src);
307 if (!page_has_buffers(dst))
308 create_empty_buffers(dst, sbh->b_size, 0);
309
310 if (copy_dirty)
311 mask |= (1UL << BH_Dirty);
312
313 dbh = dbufs = page_buffers(dst);
314 do {
315 lock_buffer(sbh);
316 lock_buffer(dbh);
317 dbh->b_state = sbh->b_state & mask;
318 dbh->b_blocknr = sbh->b_blocknr;
319 dbh->b_bdev = sbh->b_bdev;
320 sbh = sbh->b_this_page;
321 dbh = dbh->b_this_page;
322 } while (dbh != dbufs);
323
324 copy_highpage(dst, src);
325
326 if (PageUptodate(src) && !PageUptodate(dst))
327 SetPageUptodate(dst);
328 else if (!PageUptodate(src) && PageUptodate(dst))
329 ClearPageUptodate(dst);
330 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
331 SetPageMappedToDisk(dst);
332 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
333 ClearPageMappedToDisk(dst);
334
335 do {
336 unlock_buffer(sbh);
337 unlock_buffer(dbh);
338 sbh = sbh->b_this_page;
339 dbh = dbh->b_this_page;
340 } while (dbh != dbufs);
341}
342
343int nilfs_copy_dirty_pages(struct address_space *dmap,
344 struct address_space *smap)
345{
346 struct pagevec pvec;
347 unsigned int i;
348 pgoff_t index = 0;
349 int err = 0;
350
351 pagevec_init(&pvec, 0);
352repeat:
353 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
354 PAGEVEC_SIZE))
355 return 0;
356
357 for (i = 0; i < pagevec_count(&pvec); i++) {
358 struct page *page = pvec.pages[i], *dpage;
359
360 lock_page(page);
361 if (unlikely(!PageDirty(page)))
362 NILFS_PAGE_BUG(page, "inconsistent dirty state");
363
364 dpage = grab_cache_page(dmap, page->index);
365 if (unlikely(!dpage)) {
366 /* No empty page is added to the page cache */
367 err = -ENOMEM;
368 unlock_page(page);
369 break;
370 }
371 if (unlikely(!page_has_buffers(page)))
372 NILFS_PAGE_BUG(page,
373 "found empty page in dat page cache");
374
375 nilfs_copy_page(dpage, page, 1);
376 __set_page_dirty_nobuffers(dpage);
377
378 unlock_page(dpage);
379 page_cache_release(dpage);
380 unlock_page(page);
381 }
382 pagevec_release(&pvec);
383 cond_resched();
384
385 if (likely(!err))
386 goto repeat;
387 return err;
388}
389
390/**
391 * nilfs_copy_back_pages -- copy back pages to the original cache from the shadow cache
392 * @dmap: destination page cache
393 * @smap: source page cache
394 *
395 * No pages must be added to the caches while this process is running.
396 * This must be ensured by the caller.
397 */
398void nilfs_copy_back_pages(struct address_space *dmap,
399 struct address_space *smap)
400{
401 struct pagevec pvec;
402 unsigned int i, n;
403 pgoff_t index = 0;
404 int err;
405
406 pagevec_init(&pvec, 0);
407repeat:
408 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
409 if (!n)
410 return;
411 index = pvec.pages[n - 1]->index + 1;
412
413 for (i = 0; i < pagevec_count(&pvec); i++) {
414 struct page *page = pvec.pages[i], *dpage;
415 pgoff_t offset = page->index;
416
417 lock_page(page);
418 dpage = find_lock_page(dmap, offset);
419 if (dpage) {
420 /* override existing page on the destination cache */
421 WARN_ON(PageDirty(dpage));
422 nilfs_copy_page(dpage, page, 0);
423 unlock_page(dpage);
424 page_cache_release(dpage);
425 } else {
426 struct page *page2;
427
428 /* move the page to the destination cache */
429 spin_lock_irq(&smap->tree_lock);
430 page2 = radix_tree_delete(&smap->page_tree, offset);
431 WARN_ON(page2 != page);
432
433 smap->nrpages--;
434 spin_unlock_irq(&smap->tree_lock);
435
436 spin_lock_irq(&dmap->tree_lock);
437 err = radix_tree_insert(&dmap->page_tree, offset, page);
438 if (unlikely(err < 0)) {
439 WARN_ON(err == -EEXIST);
440 page->mapping = NULL;
441 page_cache_release(page); /* for cache */
442 } else {
443 page->mapping = dmap;
444 dmap->nrpages++;
445 if (PageDirty(page))
446 radix_tree_tag_set(&dmap->page_tree,
447 offset,
448 PAGECACHE_TAG_DIRTY);
449 }
450 spin_unlock_irq(&dmap->tree_lock);
451 }
452 unlock_page(page);
453 }
454 pagevec_release(&pvec);
455 cond_resched();
456
457 goto repeat;
458}
459
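/*
 * Editorial sketch (not part of the original patch): the intended
 * round trip between an original cache and its shadow, as used for
 * the DAT during GC (the dat/gcdat inode names here are assumptions
 * based on gcdat.c):
 *
 *	err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
 *	...GC operates on the shadow (gcdat)...
 *	nilfs_copy_back_pages(dat->i_mapping, gcdat->i_mapping);
 */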
460void nilfs_clear_dirty_pages(struct address_space *mapping)
461{
462 struct pagevec pvec;
463 unsigned int i;
464 pgoff_t index = 0;
465
466 pagevec_init(&pvec, 0);
467
468 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
469 PAGEVEC_SIZE)) {
470 for (i = 0; i < pagevec_count(&pvec); i++) {
471 struct page *page = pvec.pages[i];
472 struct buffer_head *bh, *head;
473
474 lock_page(page);
475 ClearPageUptodate(page);
476 ClearPageMappedToDisk(page);
477 bh = head = page_buffers(page);
478 do {
479 lock_buffer(bh);
480 clear_buffer_dirty(bh);
481 clear_buffer_nilfs_volatile(bh);
482 clear_buffer_uptodate(bh);
483 clear_buffer_mapped(bh);
484 unlock_buffer(bh);
485 bh = bh->b_this_page;
486 } while (bh != head);
487
488 __nilfs_clear_page_dirty(page);
489 unlock_page(page);
490 }
491 pagevec_release(&pvec);
492 cond_resched();
493 }
494}
495
496unsigned nilfs_page_count_clean_buffers(struct page *page,
497 unsigned from, unsigned to)
498{
499 unsigned block_start, block_end;
500 struct buffer_head *bh, *head;
501 unsigned nc = 0;
502
503 for (bh = head = page_buffers(page), block_start = 0;
504 bh != head || !block_start;
505 block_start = block_end, bh = bh->b_this_page) {
506 block_end = block_start + bh->b_size;
507 if (block_end > from && block_start < to && !buffer_dirty(bh))
508 nc++;
509 }
510 return nc;
511}
512
513/*
514 * NILFS2 needs __nilfs_clear_page_dirty() in the following two cases:
515 *
516 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
517 * page dirty flags when it copies back pages from the shadow cache
518 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
519 * (dat->{i_mapping,i_btnode_cache}).
520 *
521 * 2) Some B-tree operations like insertion or deletion may dispose buffers
522 * in dirty state, and this needs to cancel the dirty state of their pages.
523 */
524int __nilfs_clear_page_dirty(struct page *page)
525{
526 struct address_space *mapping = page->mapping;
527
528 if (mapping) {
529 spin_lock_irq(&mapping->tree_lock);
530 if (test_bit(PG_dirty, &page->flags)) {
531 radix_tree_tag_clear(&mapping->page_tree,
532 page_index(page),
533 PAGECACHE_TAG_DIRTY);
534 spin_unlock_irq(&mapping->tree_lock);
535 return clear_page_dirty_for_io(page);
536 }
537 spin_unlock_irq(&mapping->tree_lock);
538 return 0;
539 }
540 return TestClearPageDirty(page);
541}
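/*
 * Editorial sketch (not part of the original patch): case 2 above as
 * it appears in nilfs_forget_buffer() earlier in this file:
 *
 *	clear_buffer_dirty(bh);
 *	if (nilfs_page_buffers_clean(page))
 *		__nilfs_clear_page_dirty(page);
 */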
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
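/*
 * Editorial note: each BUFFER_FNS() line above expands to the usual
 * buffer-flag accessor trio, e.g. for BH_NILFS_Node:
 *
 *	set_buffer_nilfs_node(bh);
 *	clear_buffer_nilfs_node(bh);
 *	if (buffer_nilfs_node(bh))
 *		...
 */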
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
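/*
 * Editorial note: the message format (m) and arguments (a...) below
 * annotate the call sites but are not printed by this definition;
 * only the nilfs_page_bug() dump and the BUG() itself take effect.
 */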
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
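/*
 * Editorial note: the buffer head returned below carries an extra
 * reference taken with get_bh(); the caller must drop it with
 * brelse() when done.
 */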
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
75
76#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..57afa9d24061
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,919 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block
 (in block units) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - calculate the CRC of consecutive blocks
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of the start block
119 * @sum: place to store the result
120 * @offset: offset bytes in the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: disk block number (DBN) of the start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
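/*
 * Editorial sketch (not part of the original patch): verifying the
 * super root located by nilfs_search_super_root(), with ri being a
 * populated nilfs_recovery_info:
 *
 *	struct buffer_head *bh_sr;
 *
 *	err = nilfs_read_super_root_block(sb, ri->ri_super_root, &bh_sr, 1);
 *	if (!err) {
 *		...use (struct nilfs_super_root *)bh_sr->b_data...
 *		brelse(bh_sr);
 *	}
 */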
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: only checks segment summary CRC, 1: data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
317
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_sb_info *sbi,
411 struct nilfs_recovery_info *ri)
412{
413 struct list_head *head = &ri->ri_used_segments;
414 struct nilfs_segment_entry *ent, *n;
415 struct inode *sufile = nilfs->ns_sufile;
416 __u64 segnum[4];
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 nilfs_attach_writer(nilfs, sbi);
426 /*
427 * Releasing the next segment of the latest super root.
428 * The next segment is invalidated by this recovery.
429 */
430 err = nilfs_sufile_free(sufile, segnum[1]);
431 if (unlikely(err))
432 goto failed;
433
434 err = -ENOMEM;
435 for (i = 1; i < 4; i++) {
436 ent = nilfs_alloc_segment_entry(segnum[i]);
437 if (unlikely(!ent))
438 goto failed;
439 list_add_tail(&ent->list, head);
440 }
441
442 /*
443 * Collecting segments written after the latest super root.
444 * These are marked dirty to avoid being reallocated in the next write.
445 */
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum != segnum[0]) {
448 err = nilfs_sufile_scrap(sufile, ent->segnum);
449 if (unlikely(err))
450 goto failed;
451 }
452 list_del(&ent->list);
453 nilfs_free_segment_entry(ent);
454 }
455
456 /* Allocate new segments for recovery */
457 err = nilfs_sufile_alloc(sufile, &segnum[0]);
458 if (unlikely(err))
459 goto failed;
460
461 nilfs->ns_pseg_offset = 0;
462 nilfs->ns_seg_seq = ri->ri_seq + 2;
463 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
464
465 failed:
466 /* No need to recover sufile because it will be destroyed on error */
467 nilfs_detach_writer(nilfs, sbi);
468 return err;
469}
470
471static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
472 struct nilfs_recovery_block *rb,
473 struct page *page)
474{
475 struct buffer_head *bh_org;
476 void *kaddr;
477
478 bh_org = sb_bread(sbi->s_super, rb->blocknr);
479 if (unlikely(!bh_org))
480 return -EIO;
481
482 kaddr = kmap_atomic(page, KM_USER0);
483 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
484 kunmap_atomic(kaddr, KM_USER0);
485 brelse(bh_org);
486 return 0;
487}
488
489static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
490 struct list_head *head,
491 unsigned long *nr_salvaged_blocks)
492{
493 struct inode *inode;
494 struct nilfs_recovery_block *rb, *n;
495 unsigned blocksize = sbi->s_super->s_blocksize;
496 struct page *page;
497 loff_t pos;
498 int err = 0, err2 = 0;
499
500 list_for_each_entry_safe(rb, n, head, list) {
501 inode = nilfs_iget(sbi->s_super, rb->ino);
502 if (IS_ERR(inode)) {
503 err = PTR_ERR(inode);
504 inode = NULL;
505 goto failed_inode;
506 }
507
508 pos = rb->blkoff << inode->i_blkbits;
509 page = NULL;
510 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
511 0, &page, NULL, nilfs_get_block);
512 if (unlikely(err))
513 goto failed_inode;
514
515 err = nilfs_recovery_copy_block(sbi, rb, page);
516 if (unlikely(err))
517 goto failed_page;
518
519 err = nilfs_set_file_dirty(sbi, inode, 1);
520 if (unlikely(err))
521 goto failed_page;
522
523 block_write_end(NULL, inode->i_mapping, pos, blocksize,
524 blocksize, page, NULL);
525
526 unlock_page(page);
527 page_cache_release(page);
528
529 (*nr_salvaged_blocks)++;
530 goto next;
531
532 failed_page:
533 unlock_page(page);
534 page_cache_release(page);
535
536 failed_inode:
537 printk(KERN_WARNING
538 "NILFS warning: error recovering data block "
539 "(err=%d, ino=%lu, block-offset=%llu)\n",
540 err, rb->ino, (unsigned long long)rb->blkoff);
541 if (!err2)
542 err2 = err;
543 next:
544 iput(inode); /* iput(NULL) is just ignored */
545 list_del_init(&rb->list);
546 kfree(rb);
547 }
548 return err2;
549}
550
551/**
552 * nilfs_do_roll_forward - salvage logical segments newer than the latest
553 * checkpoint
554 * @sbi: nilfs_sb_info
555 * @nilfs: the_nilfs
556 * @ri: pointer to a nilfs_recovery_info
557 */
558static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
559 struct nilfs_sb_info *sbi,
560 struct nilfs_recovery_info *ri)
561{
562 struct nilfs_segsum_info ssi;
563 sector_t pseg_start;
564 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
565 unsigned long nsalvaged_blocks = 0;
566 u64 seg_seq;
567 __u64 segnum, nextnum = 0;
568 int empty_seg = 0;
569 int err = 0, ret;
570 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
571 enum {
572 RF_INIT_ST,
573 RF_DSYNC_ST, /* scanning data-sync segments */
574 };
575 int state = RF_INIT_ST;
576
577 nilfs_attach_writer(nilfs, sbi);
578 pseg_start = ri->ri_lsegs_start;
579 seg_seq = ri->ri_lsegs_start_seq;
580 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
581 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
582
583 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
584
585 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
586 if (ret) {
587 if (ret == NILFS_SEG_FAIL_IO) {
588 err = -EIO;
589 goto failed;
590 }
591 goto strayed;
592 }
593 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
594 goto confused;
595
596 /* Found a valid partial segment; do recovery actions */
597 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
598 empty_seg = 0;
599 nilfs->ns_ctime = ssi.ctime;
600 if (!(ssi.flags & NILFS_SS_GC))
601 nilfs->ns_nongc_ctime = ssi.ctime;
602
603 switch (state) {
604 case RF_INIT_ST:
605 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
606 goto try_next_pseg;
607 state = RF_DSYNC_ST;
608 /* Fall through */
609 case RF_DSYNC_ST:
610 if (!NILFS_SEG_DSYNC(&ssi))
611 goto confused;
612
613 err = collect_blocks_from_segsum(
614 sbi, pseg_start, &ssi, &dsync_blocks);
615 if (unlikely(err))
616 goto failed;
617 if (NILFS_SEG_LOGEND(&ssi)) {
618 err = recover_dsync_blocks(
619 sbi, &dsync_blocks, &nsalvaged_blocks);
620 if (unlikely(err))
621 goto failed;
622 state = RF_INIT_ST;
623 }
624 break; /* Fall through to try_next_pseg */
625 }
626
627 try_next_pseg:
628 if (pseg_start == ri->ri_lsegs_end)
629 break;
630 pseg_start += ssi.nblocks;
631 if (pseg_start < seg_end)
632 continue;
633 goto feed_segment;
634
635 strayed:
636 if (pseg_start == ri->ri_lsegs_end)
637 break;
638
639 feed_segment:
640 /* Looking to the next full segment */
641 if (empty_seg++)
642 break;
643 seg_seq++;
644 segnum = nextnum;
645 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
646 pseg_start = seg_start;
647 }
648
649 if (nsalvaged_blocks) {
650 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
651 sbi->s_super->s_id, nsalvaged_blocks);
652 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
653 }
654 out:
655 dispose_recovery_list(&dsync_blocks);
656 nilfs_detach_writer(sbi->s_nilfs, sbi);
657 return err;
658
659 confused:
660 err = -EINVAL;
661 failed:
662 printk(KERN_ERR
663 "NILFS (device %s): Error roll-forwarding "
664 "(err=%d, pseg block=%llu). ",
665 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
666 goto out;
667}
668
669static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
670 struct nilfs_sb_info *sbi,
671 struct nilfs_recovery_info *ri)
672{
673 struct buffer_head *bh;
674 int err;
675
676 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
677 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
678 return;
679
680 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
681 BUG_ON(!bh);
682 memset(bh->b_data, 0, bh->b_size);
683 set_buffer_dirty(bh);
684 err = sync_dirty_buffer(bh);
685 if (unlikely(err))
686 printk(KERN_WARNING
687 "NILFS warning: buffer sync write failed during "
688 "post-cleaning of recovery.\n");
689 brelse(bh);
690}
691
692/**
693 * nilfs_recover_logical_segments - salvage logical segments written after
694 * the latest super root
695 * @nilfs: the_nilfs
696 * @sbi: nilfs_sb_info
697 * @ri: pointer to a nilfs_recovery_info struct holding the results of nilfs_search_super_root().
698 *
699 * Return Value: On success, 0 is returned. On error, one of the following
700 * negative error codes is returned.
701 *
702 * %-EINVAL - Inconsistent filesystem state.
703 *
704 * %-EIO - I/O error
705 *
706 * %-ENOSPC - No space left on device (only in a panic state).
707 *
708 * %-ERESTARTSYS - Interrupted.
709 *
710 * %-ENOMEM - Insufficient memory available.
711 */
712int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
713 struct nilfs_sb_info *sbi,
714 struct nilfs_recovery_info *ri)
715{
716 int err;
717
718 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
719 return 0;
720
721 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
722 if (unlikely(err)) {
723 printk(KERN_ERR
724 "NILFS: error loading the latest checkpoint.\n");
725 return err;
726 }
727
728 err = nilfs_do_roll_forward(nilfs, sbi, ri);
729 if (unlikely(err))
730 goto failed;
731
732 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
733 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri);
734 if (unlikely(err)) {
735 printk(KERN_ERR "NILFS: Error preparing segments for "
736 "recovery.\n");
737 goto failed;
738 }
739
740 err = nilfs_attach_segment_constructor(sbi);
741 if (unlikely(err))
742 goto failed;
743
744 set_nilfs_discontinued(nilfs);
745 err = nilfs_construct_segment(sbi->s_super);
746 nilfs_detach_segment_constructor(sbi);
747
748 if (unlikely(err)) {
749 printk(KERN_ERR "NILFS: Oops! recovery failed. "
750 "(err=%d)\n", err);
751 goto failed;
752 }
753
754 nilfs_finish_roll_forward(nilfs, sbi, ri);
755 }
756
757 nilfs_detach_checkpoint(sbi);
758 return 0;
759
760 failed:
761 nilfs_detach_checkpoint(sbi);
762 nilfs_mdt_clear(nilfs->ns_cpfile);
763 nilfs_mdt_clear(nilfs->ns_sufile);
764 nilfs_mdt_clear(nilfs->ns_dat);
765 return err;
766}
767
768/**
769 * nilfs_search_super_root - search the latest valid super root
770 * @nilfs: the_nilfs
771 * @sbi: nilfs_sb_info
772 * @ri: pointer to a nilfs_recovery_info struct to store search results.
773 *
774 * nilfs_search_super_root() looks for the latest super root from a partial
775 * segment pointed to by the superblock. It sets up struct the_nilfs through
776 * this search. It fills in the nilfs_recovery_info (ri) required for recovery.
777 *
778 * Return Value: On success, 0 is returned. On error, one of the following
779 * negative error codes is returned.
780 *
781 * %-EINVAL - No valid segment found
782 *
783 * %-EIO - I/O error
784 */
785int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
786 struct nilfs_recovery_info *ri)
787{
788 struct nilfs_segsum_info ssi;
789 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
790 sector_t seg_start, seg_end; /* range of full segment (block number) */
791 u64 seg_seq;
792 __u64 segnum, nextnum = 0;
793 __u64 cno;
794 struct nilfs_segment_entry *ent;
795 LIST_HEAD(segments);
796 int empty_seg = 0, scan_newer = 0;
797 int ret;
798
799 pseg_start = nilfs->ns_last_pseg;
800 seg_seq = nilfs->ns_last_seq;
801 cno = nilfs->ns_last_cno;
802 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
803
804 /* Calculate range of segment */
805 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
806
807 for (;;) {
808 /* Load segment summary */
809 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
810 if (ret) {
811 if (ret == NILFS_SEG_FAIL_IO)
812 goto failed;
813 goto strayed;
814 }
815 pseg_end = pseg_start + ssi.nblocks - 1;
816 if (unlikely(pseg_end > seg_end)) {
817 ret = NILFS_SEG_FAIL_CONSISTENCY;
818 goto strayed;
819 }
820
821 /* A valid partial segment */
822 ri->ri_pseg_start = pseg_start;
823 ri->ri_seq = seg_seq;
824 ri->ri_segnum = segnum;
825 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
826 ri->ri_nextnum = nextnum;
827 empty_seg = 0;
828
829 if (!NILFS_SEG_HAS_SR(&ssi)) {
830 if (!scan_newer) {
831 /* This will never happen because a superblock
832 (last_segment) always points to a pseg
833 having a super root. */
834 ret = NILFS_SEG_FAIL_CONSISTENCY;
835 goto failed;
836 }
837 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
838 ri->ri_lsegs_start = pseg_start;
839 ri->ri_lsegs_start_seq = seg_seq;
840 }
841 if (NILFS_SEG_LOGEND(&ssi))
842 ri->ri_lsegs_end = pseg_start;
843 goto try_next_pseg;
844 }
845
846 /* A valid super root was found. */
847 ri->ri_cno = cno++;
848 ri->ri_super_root = pseg_end;
849 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
850
851 nilfs_dispose_segment_list(&segments);
852 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
853 + ssi.nblocks - seg_start;
854 nilfs->ns_seg_seq = seg_seq;
855 nilfs->ns_segnum = segnum;
856 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
857 nilfs->ns_ctime = ssi.ctime;
858 nilfs->ns_nextnum = nextnum;
859
860 if (scan_newer)
861 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
862 else {
863 if (nilfs->ns_mount_state & NILFS_VALID_FS)
864 goto super_root_found;
865 scan_newer = 1;
866 }
867
868 /* reset region for roll-forward */
869 pseg_start += ssi.nblocks;
870 if (pseg_start < seg_end)
871 continue;
872 goto feed_segment;
873
874 try_next_pseg:
875 /* Still on course, or an inconsistent state was encountered */
876 pseg_start += ssi.nblocks;
877 if (pseg_start < seg_end)
878 continue;
879 goto feed_segment;
880
881 strayed:
882 /* Off the trail */
883 if (!scan_newer)
884 /*
885 * This can happen if a checkpoint was written without
886 * barriers, or as a result of an I/O failure.
887 */
888 goto failed;
889
890 feed_segment:
891 /* Looking to the next full segment */
892 if (empty_seg++)
893 goto super_root_found; /* found a valid super root */
894
895 ent = nilfs_alloc_segment_entry(segnum);
896 if (unlikely(!ent)) {
897 ret = -ENOMEM;
898 goto failed;
899 }
900 list_add_tail(&ent->list, &segments);
901
902 seg_seq++;
903 segnum = nextnum;
904 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
905 pseg_start = seg_start;
906 }
907
908 super_root_found:
909 /* Updating pointers relating to the latest checkpoint */
910 list_splice(&segments, ri->ri_used_segments.prev);
911 nilfs->ns_last_pseg = sr_pseg_start;
912 nilfs->ns_last_seq = nilfs->ns_seg_seq;
913 nilfs->ns_last_cno = ri->ri_cno;
914 return 0;
915
916 failed:
917 nilfs_dispose_segment_list(&segments);
918 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
919}
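/*
 * Editorial sketch (not part of the original patch): the expected
 * mount-time calling order of the two entry points in this file.
 * ri must be zero-filled and its ri_used_segments list initialized
 * by the caller beforehand:
 *
 *	err = nilfs_search_super_root(nilfs, sbi, &ri);
 *	if (!err)
 *		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
 *	nilfs_dispose_segment_list(&ri.ri_used_segments);
 */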
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It covers s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
101
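/*
 * Editorial usage sketch (not part of the original patch), assuming a
 * NILFS_MOUNT_BARRIER bit exists among the NILFS_MOUNT_* flags:
 *
 *	if (nilfs_test_opt(sbi, BARRIER))
 *		nilfs_clear_opt(sbi, BARRIER);
 *	else
 *		nilfs_set_opt(sbi, BARRIER);
 */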
102#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
142 * Set up the segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 *
 * ss_sumsum protects the segment summary area excluding the ss_datasum
 * and ss_sumsum fields themselves, while ss_datasum protects the whole
 * partial segment excluding only the ss_datasum field.
 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
256 /* to be detected by nilfs_submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment
306 * @sb: super block
307 * @start: beginning disk block number of this BIO
308 * @nr_vecs: requested size of the page vector
309 *
310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
312 * Return Value: On success, a pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @segbuf: segment buffer
415 * @wi: nilfs_write_info
416 *
417 * Return Value: On success, 0 is returned. On error, the following
418 * negative error code is returned:
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
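
The wait loop above pairs one wait_for_completion() with each submitted bio, then inspects the accumulated error count. Roughly the same shape can be modeled in user space with a POSIX semaphore taking the place of the kernel completion; a sketch under that assumption, with fake_end_io() standing in for nilfs_end_bio_write():

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>

struct write_info {
	sem_t bio_event;	/* stands in for the kernel completion */
	int nbio;		/* number of submitted "bios" */
	atomic_int err;		/* accumulated I/O error count */
};

static void *fake_end_io(void *arg)
{
	struct write_info *wi = arg;
	/* a real end_io callback would check BIO_UPTODATE here */
	sem_post(&wi->bio_event);
	return NULL;
}

int main(void)
{
	struct write_info wi = { .nbio = 4 };
	pthread_t tid[4];

	sem_init(&wi.bio_event, 0, 0);
	atomic_init(&wi.err, 0);

	for (int i = 0; i < wi.nbio; i++)
		pthread_create(&tid[i], NULL, fake_end_io, &wi);

	/* the wait loop from nilfs_segbuf_wait(): one wait per bio */
	do {
		sem_wait(&wi.bio_event);
	} while (--wi.nbio > 0);

	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);

	printf("all writes completed, err=%d\n", atomic_load(&wi.err));
	sem_destroy(&wi.bio_event);
	return atomic_load(&wi.err) ? 1 : 0;
}
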
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
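
The NILFS_*_SEGBUF() macros above are thin wrappers around list_entry(), i.e. container_of(): they recover the enclosing segment buffer from its embedded sb_list link. A minimal stand-alone sketch of that mechanism; the struct and macro names below are simplified, not the kernel's:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct segbuf {
	int segnum;
	struct list_head sb_list;
};

#define LIST_SEGBUF(head) container_of((head), struct segbuf, sb_list)
#define NEXT_SEGBUF(s)    LIST_SEGBUF((s)->sb_list.next)
#define IS_LAST(s, head)  ((s)->sb_list.next == (head))

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct segbuf bufs[3] = {
		{ .segnum = 10 }, { .segnum = 11 }, { .segnum = 12 },
	};
	struct segbuf *s;

	for (int i = 0; i < 3; i++)
		list_add_tail(&bufs[i].sb_list, &head);

	/* walk the ring with the container_of-style accessors */
	for (s = LIST_SEGBUF(head.next); ; s = NEXT_SEGBUF(s)) {
		printf("segnum %d\n", s->segnum);
		if (IS_LAST(s, &head))
			break;
	}
	return 0;
}
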
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
2 * seglist.h - expedient structures and routines to handle lists of segments
3 *             (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED		0x0001 /* The segment was freed provisionally.
36					  It must be cancelled if
37					  construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
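
nilfs_open_segment_entry()/nilfs_close_segment_entry() form a get/put pair, and close is written to be safe against a second call because it tests and clears bh_su. A toy model of that discipline, with malloc()/free() standing in for the sufile get/put helpers (nothing here is kernel API):

#include <stdio.h>
#include <stdlib.h>

struct segment_entry {
	unsigned long segnum;
	void *bh_su;		/* stands in for the buffer_head reference */
};

static void open_entry(struct segment_entry *ent)
{
	if (!ent->bh_su)
		ent->bh_su = malloc(1);	/* "get segment usage" */
}

static void close_entry(struct segment_entry *ent)
{
	if (!ent->bh_su)
		return;			/* already closed: no double put */
	free(ent->bh_su);		/* "put segment usage" */
	ent->bh_su = NULL;
}

int main(void)
{
	struct segment_entry ent = { .segnum = 7 };

	open_entry(&ent);
	close_entry(&ent);
	close_entry(&ent);		/* harmless second close */
	printf("entry %lu closed\n", ent.segnum);
	return 0;
}
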
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..22c7f65c2403
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2978 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
52 appended in collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
59 SC_FLUSH_FILE, /* Flush data files, leads to segment writes without
60 creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
117
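
These macros compare 32-bit sequence counters through a signed subtraction so that the ordering survives wrap-around, the same trick as the kernel's time_after(). A runnable demonstration; cnt32_gt() here is a local restatement of the macro, not the kernel definition:

#include <stdio.h>
#include <stdint.h>

/* wrap-safe comparison of 32-bit sequence counters via signed
 * subtraction, as in nilfs_cnt32_gt() */
static int cnt32_gt(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t before = 0xfffffffeu;	/* about to wrap around */
	uint32_t after  = before + 5;	/* wraps to 3 */

	printf("after  > before: %d\n", cnt32_gt(after, before));	/* 1 */
	printf("before > after : %d\n", cnt32_gt(before, after));	/* 0 */
	return 0;
}
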
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
164			 * If the journal_info field is occupied by another
165			 * FS, it is saved and will be restored on
166			 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
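
nilfs_prepare_segment_lock() implements the nesting rule behind nilfs_transaction_begin()/commit(): inner pairs only bump and drop ti_count, and only the outermost pair touches the semaphore. A deliberately simplified single-threaded model of that rule, which omits the foreign-journal_info and slab-allocation cases handled above:

#include <stdio.h>

static int ti_count;

/* only the outermost begin/commit pair acquires and releases the
 * segment semaphore; nested pairs just count */
static void tx_begin(void)
{
	if (ti_count++ == 0)
		printf("outermost begin: acquire segment semaphore\n");
}

static void tx_commit(void)
{
	if (--ti_count == 0)
		printf("outermost commit: release segment semaphore\n");
}

int main(void)
{
	tx_begin();	/* takes the lock */
	tx_begin();	/* nested: counter only */
	tx_commit();	/* nested: counter only */
	tx_commit();	/* releases the lock */
	return 0;
}
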
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
196 * the segment semaphore, to make segment construction and write tasks
197 * exclusive. The function is used in a pair with nilfs_transaction_commit().
198 * The region enclosed by these two functions can be nested. To avoid a
199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
201 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; otherwise a new struct is allocated from a slab cache.
205 *
206 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
211 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
214 * %-ENOSPC - No space left on device
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
253 * in the outermost call of this function. If a commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
389
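
The mapping helper above is a bump-pointer allocator over a chain of fixed-size summary blocks: when an entry would straddle a block boundary, the cursor jumps to the start of the next block. A stand-alone sketch of the arithmetic; BLOCKSIZE and the ssp struct below are illustrative stand-ins:

#include <stdio.h>

#define BLOCKSIZE 64

struct ssp { int block; unsigned offset; };

/* entries are packed into fixed-size summary blocks; an entry that
 * would cross a boundary moves the cursor to the next block, as in
 * nilfs_segctor_map_segsum_entry() */
static unsigned map_entry(struct ssp *ssp, unsigned bytes)
{
	if (ssp->offset + bytes > BLOCKSIZE) {
		ssp->block++;	/* NILFS_SEGBUF_NEXT_BH() in the original */
		ssp->offset = 0;
	}
	unsigned pos = ssp->offset;
	ssp->offset += bytes;
	return pos;
}

int main(void)
{
	struct ssp ssp = { 0, 0 };
	unsigned sizes[] = { 24, 24, 24, 40 };

	for (int i = 0; i < 4; i++) {
		unsigned off = map_entry(&ssp, sizes[i]);
		printf("entry %d: block %d offset %u\n", i, ssp.block, off);
	}
	return 0;
}
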
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
451	/* finfo and binfo are small enough relative to the blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
524 /* Substitution to vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
666
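
The three ops tables above select per-file-type behavior through function pointers, with NULL entries marking stages the dsync mode skips. A compact model of that dispatch; sc_operations is cut down here to two hooks for illustration:

#include <stdio.h>

struct sc_operations {
	int (*collect_data)(int blk);
	int (*collect_node)(int blk);	/* NULL => stage skipped */
};

static int file_data(int blk) { printf("data %d\n", blk); return 0; }
static int file_node(int blk) { printf("node %d\n", blk); return 0; }

static const struct sc_operations file_ops = {
	.collect_data = file_data,
	.collect_node = file_node,
};

static const struct sc_operations dsync_ops = {
	.collect_data = file_data,
	.collect_node = NULL,		/* dsync skips node blocks */
};

static void scan(const struct sc_operations *ops, int blk)
{
	ops->collect_data(blk);
	if (ops->collect_node)		/* mirror the NULL checks */
		ops->collect_node(blk);
}

int main(void)
{
	scan(&file_ops, 1);
	scan(&dsync_ops, 2);
	return 0;
}
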
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680		 * A valid range is given for syncing data pages. The
681		 * range is rounded to page boundaries; extra dirty
682		 * buffers may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
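
nilfs_lookup_dirty_data_buffers() scans tagged pages in pagevec-sized batches and stops as soon as nlimit buffers are collected; since callers pass rest + 1, a return value greater than rest signals overflow. A user-space sketch of that batched, limit-bounded scan, where BATCH and the dirty[] array are stand-ins rather than kernel structures:

#include <stdio.h>

#define BATCH 16	/* stands in for PAGEVEC_SIZE */

/* scan up to BATCH "pages" at a time, stopping early once nlimit
 * buffers have been collected */
static size_t collect_dirty(const int *dirty, size_t npages, size_t nlimit)
{
	size_t ndirties = 0;

	for (size_t i = 0; i < npages; i += BATCH) {
		size_t end = i + BATCH < npages ? i + BATCH : npages;

		for (size_t j = i; j < end; j++) {
			if (!dirty[j])
				continue;
			if (++ndirties >= nlimit)
				return ndirties; /* caller sees overflow */
		}
		/* the real scan would cond_resched() between batches */
	}
	return ndirties;
}

int main(void)
{
	int dirty[40];

	for (int i = 0; i < 40; i++)
		dirty[i] = (i % 3 == 0);
	printf("collected %zu buffers\n", collect_dirty(dirty, 40, 10));
	return 0;
}
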
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
855		/* The following code duplicates part of cpfile, but it is
856		   needed to collect the checkpoint even if it was not
857		   newly created. */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
908
909{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
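
The checksum above is seeded with a per-filesystem value and deliberately skips the sr_sum field, so the stored checksum never covers its own bytes. A sketch using a bitwise crc32_le (assumed to use the same reflected polynomial as the kernel's table-driven crc32_le; the seed and buffer values below are made up):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC-32, polynomial 0xedb88320, no implicit pre/post
 * inversion: the seed is supplied by the caller */
static uint32_t crc32_le(uint32_t crc, const unsigned char *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
	}
	return crc;
}

int main(void)
{
	unsigned char raw_sr[32];
	uint32_t seed = 0x11dead11;	/* hypothetical per-fs seed */

	memset(raw_sr, 0xab, sizeof(raw_sr));
	/* skip the leading 4-byte checksum field, exactly as
	 * nilfs_fill_in_super_root_crc() skips sr_sum */
	printf("sr_sum = 0x%08x\n",
	       crc32_le(seed, raw_sr + 4, sizeof(raw_sr) - 4));
	return 0;
}
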
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1010
1011{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
1020		WARN_ON(err); /* does not happen */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
1101			BUG_ON(!err); /* always receives -E2BIG or a real error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
1153		/* always receives -E2BIG or a real error if n > rest */
1154 }
1155 return err;
1156}
1157
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
1306
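
nilfs_segctor_collect_blocks() is a resumable state machine: sci->sc_stage.scnt records where collection stopped, each case falls through into the next stage, and a failed stage is retried from the saved point on the next call. The skeleton of that control flow, reduced to three hypothetical stages:

#include <stdio.h>

enum { ST_INIT, ST_FILE, ST_IFILE, ST_DONE };

/* each call resumes at *scnt and falls through stage by stage;
 * on failure it returns with *scnt parked at the failing stage */
static int run_stages(int *scnt, int fail_at)
{
	switch (*scnt) {
	case ST_INIT:
		printf("init\n");
		(*scnt)++;		/* fall through */
	case ST_FILE:
		if (fail_at == ST_FILE)
			return -1;	/* resume here on the next call */
		printf("file\n");
		(*scnt)++;		/* fall through */
	case ST_IFILE:
		printf("ifile\n");
		*scnt = ST_DONE;	/* fall through */
	case ST_DONE:
		return 0;
	}
	return 0;
}

int main(void)
{
	int scnt = ST_INIT;

	if (run_stages(&scnt, ST_FILE) < 0)
		printf("interrupted at stage %d\n", scnt);
	run_stages(&scnt, -1);		/* resumes from ST_FILE */
	return 0;
}
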
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified with nextnum might be allocated during
1382 * the previous construction, the buffer including its segusage may
1383 * not be dirty. The following call ensures that the buffer is dirty
1384 * and will pin the buffer on memory until the sufile is written.
1385	 * and will pin the buffer in memory until the sufile is written.
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
1506		WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
1527	WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
1535		WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
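/*
 * Illustrative sketch, not part of the kernel source: when a collection pass
 * overflows the current segment (-E2BIG), nilfs_segctor_collect() retries
 * after extending the segment list, doubling the extension count each time
 * and capping it at SC_MAX_SEGDELTA.  The growth schedule in isolation (the
 * cap of 64 below is only an example value):
 */
static int next_extension(int nadd, int cap)
{
	int doubled = nadd << 1;

	return doubled < cap ? doubled : cap;	/* min(nadd * 2, cap) */
}
/* Starting from 1: 2, 4, 8, 16, 32, 64, 64, ... so repeated overflows grow
 * the extension geometrically but never without bound. */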
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
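/*
 * Illustrative sketch, not part of the kernel source: the payload walk above
 * consumes, for each finfo, fi_ndatablk data blocks followed by node blocks,
 * fi_nblocks blocks in total, and moves to the next finfo when the block
 * count reaches zero.  The counter discipline alone, with hypothetical
 * names (the caller clears *done before each finfo):
 */
struct finfo_ctr { unsigned long nblocks, ndatablk; };

/* Classify the current block (1 = data, 0 = node) and advance the
 * counters; *done is set once the finfo is exhausted. */
static int consume_block(struct finfo_ctr *f, int *done)
{
	int is_data = f->ndatablk > 0;

	if (--f->nblocks == 0)
		*done = 1;
	else if (is_data)
		f->ndatablk--;
	return is_data;
}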
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756		/* For split b-tree node pages, this function may be called
1757		   twice; this check ignores the second and later calls. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
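/*
 * Illustrative sketch, not part of the kernel source: buffer heads of a page
 * form a circular singly linked ring through b_this_page, so every walk in
 * this file uses the same do/while shape as above: start anywhere and stop
 * when the cursor returns to the starting element.  With a hypothetical
 * ring node:
 */
struct ring { int flag; struct ring *next; };

static int ring_any_flagged(struct ring *head)
{
	struct ring *r = head;

	do {
		if (r->flag)
			return 1;
		r = r->next;
	} while (r != head);	/* full circle: no element was flagged */
	return 0;
}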
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894		/* For b-tree node pages, this function may be called twice
1895		   or more because they might be split within a segment.
1896		   This check ensures that cleanup runs only after all
1897		   buffers of a split btnode page have been cleared. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007		 * We assume that buffers belonging to the same page are
2008		 * contiguous in the buffer list.
2009		 * Under this assumption, the last BH of each page is
2010		 * identifiable by a discontinuity in bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code of B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045	 * Since pages may span multiple segment buffers, the end of
2046	 * the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
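/*
 * Illustrative sketch, not part of the kernel source: the loops above detect
 * a page boundary by a change of bh->b_page between consecutive list
 * entries, and finish the previous page only when the owner pointer
 * differs.  Grouping a stream of (owner, item) records with the same trick:
 */
#include <stdio.h>

struct record { const void *owner; int val; };

static void walk_groups(const struct record *rec, int n)
{
	const void *cur = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (rec[i].owner != cur) {
			if (cur)
				printf("end of group %p\n", cur);
			cur = rec[i].owner;	/* a new group starts */
		}
	}
	if (cur)	/* like the code above, the last group must be
			   finished outside of the loop */
		printf("end of group %p\n", cur);
}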
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
2255
2256/**
2257 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
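/*
 * Illustrative sketch, not part of the kernel source: sc_flush_request is a
 * bitmap in which bit 0 stands for ordinary data files and bit
 * NILFS_DAT_INO for the DAT metadata file; nilfs_segctor_do_flush() sets a
 * bit and wakes the daemon only on the empty -> non-empty transition.  The
 * bitmap discipline by itself:
 */
static void request_flush(unsigned long *reqmap, int bit, void (*wake)(void))
{
	if (!(*reqmap & (1UL << bit))) {
		unsigned long prev = *reqmap;

		*reqmap |= 1UL << bit;
		if (!prev)	/* the first pending request wakes the daemon;
				   later ones piggyback on it */
			wake();
	}
}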
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
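/*
 * Illustrative sketch, not part of the kernel source: each waiter records a
 * 32-bit request sequence number and is released once sc_seq_done has
 * passed it.  nilfs_cnt32_ge() is assumed to be a wraparound-safe
 * comparison of the usual signed-difference form:
 */
#include <stdint.h>

static int cnt32_ge(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;	/* a is at or past b, modulo 2^32 */
}
/* For example, cnt32_ge(0x00000002, 0xfffffffe) is true: the counter has
 * wrapped, but sequence 0xfffffffe was issued before sequence 2. */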
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412 * Return Value: On success, 0 is returned. On errors, one of the following
2413 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449 * Return Value: On success, 0 is returned. On errors, one of the following
2450 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2593 void **kbufs)
2594{
2595 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2596 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2597 struct the_nilfs *nilfs = sbi->s_nilfs;
2598 struct nilfs_transaction_info ti;
2599 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2600 int err;
2601
2602 if (unlikely(!sci))
2603 return -EROFS;
2604
2605 nilfs_transaction_lock(sbi, &ti, 1);
2606
2607 err = nilfs_init_gcdat_inode(nilfs);
2608 if (unlikely(err))
2609 goto out_unlock;
2610 err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
2611 if (unlikely(err))
2612 goto out_unlock;
2613
2614 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2615
2616 for (;;) {
2617 nilfs_segctor_accept(sci, &req);
2618 err = nilfs_segctor_construct(sci, &req);
2619 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2620 nilfs_segctor_notify(sci, &req);
2621
2622 if (likely(!err))
2623 break;
2624
2625 nilfs_warning(sb, __func__,
2626 "segment construction failed. (err=%d)", err);
2627 set_current_state(TASK_INTERRUPTIBLE);
2628 schedule_timeout(sci->sc_interval);
2629 }
2630
2631 out_unlock:
2632 nilfs_clear_gcdat_inode(nilfs);
2633 nilfs_transaction_unlock(sbi);
2634 return err;
2635}
2636
2637static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2638{
2639 struct nilfs_sb_info *sbi = sci->sc_sbi;
2640 struct nilfs_transaction_info ti;
2641 struct nilfs_segctor_req req = { .mode = mode };
2642
2643 nilfs_transaction_lock(sbi, &ti, 0);
2644
2645 nilfs_segctor_accept(sci, &req);
2646 nilfs_segctor_construct(sci, &req);
2647 nilfs_segctor_notify(sci, &req);
2648
2649 /*
2650	 * An unclosed segment should be retried, which is done via sc_timer:
2651	 * when the timer expires, it triggers a complete construction that
2652	 * closes the current logical segment.
2653 */
2654 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2655 nilfs_segctor_start_timer(sci);
2656
2657 nilfs_transaction_unlock(sbi);
2658}
2659
2660static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2661{
2662 int mode = 0;
2663 int err;
2664
2665 spin_lock(&sci->sc_state_lock);
2666 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2667 SC_FLUSH_DAT : SC_FLUSH_FILE;
2668 spin_unlock(&sci->sc_state_lock);
2669
2670 if (mode) {
2671 err = nilfs_segctor_do_construct(sci, mode);
2672
2673 spin_lock(&sci->sc_state_lock);
2674 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2675 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2676 spin_unlock(&sci->sc_state_lock);
2677 }
2678 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2679}
2680
2681static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2682{
2683 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2684 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2685 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2686 return SC_FLUSH_FILE;
2687 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2688 return SC_FLUSH_DAT;
2689 }
2690 return SC_LSEG_SR;
2691}
2692
2693/**
2694 * nilfs_segctor_thread - main loop of the segment constructor thread.
2695 * @arg: pointer to a struct nilfs_sc_info.
2696 *
2697 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2698 * to execute segment constructions.
2699 */
2700static int nilfs_segctor_thread(void *arg)
2701{
2702 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2703 struct timer_list timer;
2704 int timeout = 0;
2705
2706 init_timer(&timer);
2707 timer.data = (unsigned long)current;
2708 timer.function = nilfs_construction_timeout;
2709 sci->sc_timer = &timer;
2710
2711 /* start sync. */
2712 sci->sc_task = current;
2713 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2714 printk(KERN_INFO
2715 "segctord starting. Construction interval = %lu seconds, "
2716 "CP frequency < %lu seconds\n",
2717 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2718
2719 spin_lock(&sci->sc_state_lock);
2720 loop:
2721 for (;;) {
2722 int mode;
2723
2724 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2725 goto end_thread;
2726
2727 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2728 mode = SC_LSEG_SR;
2729 else if (!sci->sc_flush_request)
2730 break;
2731 else
2732 mode = nilfs_segctor_flush_mode(sci);
2733
2734 spin_unlock(&sci->sc_state_lock);
2735 nilfs_segctor_thread_construct(sci, mode);
2736 spin_lock(&sci->sc_state_lock);
2737 timeout = 0;
2738 }
2739
2740
2741 if (freezing(current)) {
2742 spin_unlock(&sci->sc_state_lock);
2743 refrigerator();
2744 spin_lock(&sci->sc_state_lock);
2745 } else {
2746 DEFINE_WAIT(wait);
2747 int should_sleep = 1;
2748
2749 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2750 TASK_INTERRUPTIBLE);
2751
2752 if (sci->sc_seq_request != sci->sc_seq_done)
2753 should_sleep = 0;
2754 else if (sci->sc_flush_request)
2755 should_sleep = 0;
2756 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2757 should_sleep = time_before(jiffies,
2758 sci->sc_timer->expires);
2759
2760 if (should_sleep) {
2761 spin_unlock(&sci->sc_state_lock);
2762 schedule();
2763 spin_lock(&sci->sc_state_lock);
2764 }
2765 finish_wait(&sci->sc_wait_daemon, &wait);
2766 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2767 time_after_eq(jiffies, sci->sc_timer->expires));
2768 }
2769 goto loop;
2770
2771 end_thread:
2772 spin_unlock(&sci->sc_state_lock);
2773 del_timer_sync(sci->sc_timer);
2774 sci->sc_timer = NULL;
2775
2776 /* end sync. */
2777 sci->sc_task = NULL;
2778 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2779 return 0;
2780}
2781
2782static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2783{
2784 struct task_struct *t;
2785
2786 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2787 if (IS_ERR(t)) {
2788 int err = PTR_ERR(t);
2789
2790 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2791 err);
2792 return err;
2793 }
2794 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2795 return 0;
2796}
2797
2798static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2799{
2800 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2801
2802 while (sci->sc_task) {
2803 wake_up(&sci->sc_wait_daemon);
2804 spin_unlock(&sci->sc_state_lock);
2805 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2806 spin_lock(&sci->sc_state_lock);
2807 }
2808}
2809
2810static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2811{
2812 sci->sc_seq_done = sci->sc_seq_request;
2813
2814 return nilfs_segctor_start_thread(sci);
2815}
2816
2817/*
2818 * Setup & clean-up functions
2819 */
2820static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2821{
2822 struct nilfs_sc_info *sci;
2823
2824 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2825 if (!sci)
2826 return NULL;
2827
2828 sci->sc_sbi = sbi;
2829 sci->sc_super = sbi->s_super;
2830
2831 init_waitqueue_head(&sci->sc_wait_request);
2832 init_waitqueue_head(&sci->sc_wait_daemon);
2833 init_waitqueue_head(&sci->sc_wait_task);
2834 spin_lock_init(&sci->sc_state_lock);
2835 INIT_LIST_HEAD(&sci->sc_dirty_files);
2836 INIT_LIST_HEAD(&sci->sc_segbufs);
2837 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2838 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2839 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2840
2841 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2842 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2843 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2844
2845 if (sbi->s_interval)
2846 sci->sc_interval = sbi->s_interval;
2847 if (sbi->s_watermark)
2848 sci->sc_watermark = sbi->s_watermark;
2849 return sci;
2850}
2851
2852static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2853{
2854 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2855
2856	/* The segctord thread was stopped and its timer removed,
2857	   but pending work may remain. */
2858 do {
2859 struct nilfs_sb_info *sbi = sci->sc_sbi;
2860 struct nilfs_transaction_info ti;
2861 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2862
2863 nilfs_transaction_lock(sbi, &ti, 0);
2864 nilfs_segctor_accept(sci, &req);
2865 ret = nilfs_segctor_construct(sci, &req);
2866 nilfs_segctor_notify(sci, &req);
2867 nilfs_transaction_unlock(sbi);
2868
2869 } while (ret && retrycount-- > 0);
2870}
2871
2872/**
2873 * nilfs_segctor_destroy - destroy the segment constructor.
2874 * @sci: nilfs_sc_info
2875 *
2876 * nilfs_segctor_destroy() kills the segctord thread and frees
2877 * the nilfs_sc_info struct.
2878 * Caller must hold the segment semaphore.
2879 */
2880static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2881{
2882 struct nilfs_sb_info *sbi = sci->sc_sbi;
2883 int flag;
2884
2885 up_write(&sbi->s_nilfs->ns_segctor_sem);
2886
2887 spin_lock(&sci->sc_state_lock);
2888 nilfs_segctor_kill_thread(sci);
2889 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2890 || sci->sc_seq_request != sci->sc_seq_done);
2891 spin_unlock(&sci->sc_state_lock);
2892
2893 if (flag || nilfs_segctor_confirm(sci))
2894 nilfs_segctor_write_out(sci);
2895
2896 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2897
2898 if (!list_empty(&sci->sc_dirty_files)) {
2899 nilfs_warning(sbi->s_super, __func__,
2900 "dirty file(s) after the final construction\n");
2901 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2902 }
2903
2904 if (!list_empty(&sci->sc_cleaning_segments))
2905 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2906
2907 WARN_ON(!list_empty(&sci->sc_segbufs));
2908
2909 down_write(&sbi->s_nilfs->ns_segctor_sem);
2910
2911 kfree(sci);
2912}
2913
2914/**
2915 * nilfs_attach_segment_constructor - attach a segment constructor
2916 * @sbi: nilfs_sb_info
2917 *
2918 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2919 * initializes it, and starts the segment constructor.
2920 *
2921 * Return Value: On success, 0 is returned. On error, one of the following
2922 * negative error codes is returned.
2923 *
2924 * %-ENOMEM - Insufficient memory available.
2925 */
2926int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2927{
2928 struct the_nilfs *nilfs = sbi->s_nilfs;
2929 int err;
2930
2931 /* Each field of nilfs_segctor is cleared through the initialization
2932 of super-block info */
2933 sbi->s_sc_info = nilfs_segctor_new(sbi);
2934 if (!sbi->s_sc_info)
2935 return -ENOMEM;
2936
2937 nilfs_attach_writer(nilfs, sbi);
2938 err = nilfs_segctor_init(NILFS_SC(sbi));
2939 if (err) {
2940 nilfs_detach_writer(nilfs, sbi);
2941 kfree(sbi->s_sc_info);
2942 sbi->s_sc_info = NULL;
2943 }
2944 return err;
2945}
2946
2947/**
2948 * nilfs_detach_segment_constructor - destroy the segment constructor
2949 * @sbi: nilfs_sb_info
2950 *
2951 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2952 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2953 */
2954void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2955{
2956 struct the_nilfs *nilfs = sbi->s_nilfs;
2957 LIST_HEAD(garbage_list);
2958
2959 down_write(&nilfs->ns_segctor_sem);
2960 if (NILFS_SC(sbi)) {
2961 nilfs_segctor_destroy(NILFS_SC(sbi));
2962 sbi->s_sc_info = NULL;
2963 }
2964
2965 /* Force to free the list of dirty files */
2966 spin_lock(&sbi->s_inode_lock);
2967 if (!list_empty(&sbi->s_dirty_files)) {
2968 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2969 nilfs_warning(sbi->s_super, __func__,
2970			      "Non-empty dirty list after the last "
2971 "segment construction\n");
2972 }
2973 spin_unlock(&sbi->s_inode_lock);
2974 up_write(&nilfs->ns_segctor_sem);
2975
2976 nilfs_dispose_list(sbi, &garbage_list, 1);
2977 nilfs_detach_writer(nilfs, sbi);
2978}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..476bdd5df5be
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,244 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer on the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
226 void **);
227
228extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
229 __u64 *, size_t);
230extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
231
232extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
233extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
234
235/* recovery.c */
236extern int nilfs_read_super_root_block(struct super_block *, sector_t,
237 struct buffer_head **, int);
238extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
239 struct nilfs_recovery_info *);
240extern int nilfs_recover_logical_segments(struct the_nilfs *,
241 struct nilfs_sb_info *,
242 struct nilfs_recovery_info *);
243
244#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..98e68677f045
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,558 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
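/*
 * Illustrative sketch, not part of the kernel source: the two helpers above
 * map a segment number to the sufile block holding its usage entry and to
 * the entry's slot inside that block by division and remainder on
 * (segnum + first_entry_offset).  In plain C:
 */
static void segnum_to_pos(unsigned long long segnum,
			  unsigned long first_entry_offset,
			  unsigned long entries_per_block,
			  unsigned long *blkoff, unsigned long *offset)
{
	unsigned long long t = segnum + first_entry_offset;

	*blkoff = (unsigned long)(t / entries_per_block);
	*offset = (unsigned long)(t % entries_per_block);
}
/* The kernel uses do_div() instead of plain 64-bit '/' and '%' because
 * native 64-bit division is not available on all 32-bit targets; it divides
 * in place and returns the remainder. */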
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
97 u64 ncleanadd, u64 ndirtyadd)
98{
99 struct nilfs_sufile_header *header;
100 void *kaddr;
101
102 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
103 header = kaddr + bh_offset(header_bh);
104 le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
105 le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
106 kunmap_atomic(kaddr, KM_USER0);
107
108 nilfs_mdt_mark_buffer_dirty(header_bh);
109}
110
111int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
112 void (*dofunc)(struct inode *, __u64,
113 struct buffer_head *,
114 struct buffer_head *))
115{
116 struct buffer_head *header_bh, *bh;
117 int ret;
118
119 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
120 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
121 __func__, (unsigned long long)segnum);
122 return -EINVAL;
123 }
124 down_write(&NILFS_MDT(sufile)->mi_sem);
125
126 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
127 if (ret < 0)
128 goto out_sem;
129
130 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
131 if (!ret) {
132 dofunc(sufile, segnum, header_bh, bh);
133 brelse(bh);
134 }
135 brelse(header_bh);
136
137 out_sem:
138 up_write(&NILFS_MDT(sufile)->mi_sem);
139 return ret;
140}
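/*
 * Illustrative sketch, not part of the kernel source:
 * nilfs_sufile_update() factors the common "validate, take the semaphore,
 * load the header and entry blocks, apply an operation, release" sequence
 * out of its callers, which differ only in the dofunc callback.  The shape
 * of the pattern, with hypothetical types:
 */
struct resource;	/* stands in for the header/entry buffers */

static int with_resource(struct resource *r,
			 int (*acquire)(struct resource *),
			 void (*release)(struct resource *),
			 void (*dofunc)(struct resource *))
{
	int err = acquire(r);

	if (err)
		return err;
	dofunc(r);		/* caller-specific mutation */
	release(r);
	return 0;
}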
141
142/**
143 * nilfs_sufile_alloc - allocate a segment
144 * @sufile: inode of segment usage file
145 * @segnump: pointer to segment number
146 *
147 * Description: nilfs_sufile_alloc() allocates a clean segment.
148 *
149 * Return Value: On success, 0 is returned and the segment number of the
150 * allocated segment is stored in the location pointed to by @segnump. On error, one
151 * of the following negative error codes is returned.
152 *
153 * %-EIO - I/O error.
154 *
155 * %-ENOMEM - Insufficient amount of memory available.
156 *
157 * %-ENOSPC - No clean segment left.
158 */
159int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
160{
161 struct buffer_head *header_bh, *su_bh;
162 struct nilfs_sufile_header *header;
163 struct nilfs_segment_usage *su;
164 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
165 __u64 segnum, maxsegnum, last_alloc;
166 void *kaddr;
167 unsigned long nsegments, ncleansegs, nsus;
168 int ret, i, j;
169
170 down_write(&NILFS_MDT(sufile)->mi_sem);
171
172 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
173 if (ret < 0)
174 goto out_sem;
175 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
176 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
177 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
178 last_alloc = le64_to_cpu(header->sh_last_alloc);
179 kunmap_atomic(kaddr, KM_USER0);
180
181 nsegments = nilfs_sufile_get_nsegments(sufile);
182 segnum = last_alloc + 1;
183 maxsegnum = nsegments - 1;
184 for (i = 0; i < nsegments; i += nsus) {
185 if (segnum >= nsegments) {
186 /* wrap around */
187 segnum = 0;
188 maxsegnum = last_alloc;
189 }
190 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
191 &su_bh);
192 if (ret < 0)
193 goto out_header;
194 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
195 su = nilfs_sufile_block_get_segment_usage(
196 sufile, segnum, su_bh, kaddr);
197
198 nsus = nilfs_sufile_segment_usages_in_block(
199 sufile, segnum, maxsegnum);
200 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
201 if (!nilfs_segment_usage_clean(su))
202 continue;
203 /* found a clean segment */
204 nilfs_segment_usage_set_dirty(su);
205 kunmap_atomic(kaddr, KM_USER0);
206
207 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
208 header = nilfs_sufile_block_get_header(
209 sufile, header_bh, kaddr);
210 le64_add_cpu(&header->sh_ncleansegs, -1);
211 le64_add_cpu(&header->sh_ndirtysegs, 1);
212 header->sh_last_alloc = cpu_to_le64(segnum);
213 kunmap_atomic(kaddr, KM_USER0);
214
215 nilfs_mdt_mark_buffer_dirty(header_bh);
216 nilfs_mdt_mark_buffer_dirty(su_bh);
217 nilfs_mdt_mark_dirty(sufile);
218 brelse(su_bh);
219 *segnump = segnum;
220 goto out_header;
221 }
222
223 kunmap_atomic(kaddr, KM_USER0);
224 brelse(su_bh);
225 }
226
227 /* no segments left */
228 ret = -ENOSPC;
229
230 out_header:
231 brelse(header_bh);
232
233 out_sem:
234 up_write(&NILFS_MDT(sufile)->mi_sem);
235 return ret;
236}
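As a usage sketch (the wrapper function is hypothetical, not part of this patch), a caller passes the sufile inode and receives the number of the newly dirtied segment on success:

/* Hypothetical call site; "sufile" is the segment usage file inode. */
static int example_alloc_segment(struct inode *sufile)
{
	__u64 segnum;
	int err;

	err = nilfs_sufile_alloc(sufile, &segnum);
	if (err == -ENOSPC)
		printk(KERN_WARNING "no clean segments left\n");
	if (err)
		return err;
	/* segnum now names a segment that has been marked dirty for us */
	return 0;
}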
237
238void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
239 struct buffer_head *header_bh,
240 struct buffer_head *su_bh)
241{
242 struct nilfs_segment_usage *su;
243 void *kaddr;
244
245 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
246 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
247 if (unlikely(!nilfs_segment_usage_clean(su))) {
248 printk(KERN_WARNING "%s: segment %llu must be clean\n",
249 __func__, (unsigned long long)segnum);
250 kunmap_atomic(kaddr, KM_USER0);
251 return;
252 }
253 nilfs_segment_usage_set_dirty(su);
254 kunmap_atomic(kaddr, KM_USER0);
255
256 nilfs_sufile_mod_counter(header_bh, -1, 1);
257 nilfs_mdt_mark_buffer_dirty(su_bh);
258 nilfs_mdt_mark_dirty(sufile);
259}
260
261void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
262 struct buffer_head *header_bh,
263 struct buffer_head *su_bh)
264{
265 struct nilfs_segment_usage *su;
266 void *kaddr;
267 int clean, dirty;
268
269 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
270 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
271 if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
272 su->su_nblocks == cpu_to_le32(0)) {
273 kunmap_atomic(kaddr, KM_USER0);
274 return;
275 }
276 clean = nilfs_segment_usage_clean(su);
277 dirty = nilfs_segment_usage_dirty(su);
278
279 /* make the segment garbage */
280 su->su_lastmod = cpu_to_le64(0);
281 su->su_nblocks = cpu_to_le32(0);
282 su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
283 kunmap_atomic(kaddr, KM_USER0);
284
285 nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
286 nilfs_mdt_mark_buffer_dirty(su_bh);
287 nilfs_mdt_mark_dirty(sufile);
288}
289
290void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
291 struct buffer_head *header_bh,
292 struct buffer_head *su_bh)
293{
294 struct nilfs_segment_usage *su;
295 void *kaddr;
296 int sudirty;
297
298 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
299 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
300 if (nilfs_segment_usage_clean(su)) {
301 printk(KERN_WARNING "%s: segment %llu is already clean\n",
302 __func__, (unsigned long long)segnum);
303 kunmap_atomic(kaddr, KM_USER0);
304 return;
305 }
306 WARN_ON(nilfs_segment_usage_error(su));
307 WARN_ON(!nilfs_segment_usage_dirty(su));
308
309 sudirty = nilfs_segment_usage_dirty(su);
310 nilfs_segment_usage_set_clean(su);
311 kunmap_atomic(kaddr, KM_USER0);
312 nilfs_mdt_mark_buffer_dirty(su_bh);
313
314 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
315 nilfs_mdt_mark_dirty(sufile);
316}
317
318/**
319 * nilfs_sufile_get_segment_usage - get a segment usage
320 * @sufile: inode of segment usage file
321 * @segnum: segment number
322 * @sup: pointer to segment usage
323 * @bhp: pointer to buffer head
324 *
325 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
326 * specified by @segnum.
327 *
328 * Return Value: On success, 0 is returned, and the segment usage and the
329 * buffer head of the buffer on which the segment usage is located are stored
330 * in the place pointed by @sup and @bhp, respectively. On error, one of the
331 * following negative error codes is returned.
332 *
333 * %-EIO - I/O error.
334 *
335 * %-ENOMEM - Insufficient amount of memory available.
336 *
337 * %-EINVAL - Invalid segment usage number.
338 */
339int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
340 struct nilfs_segment_usage **sup,
341 struct buffer_head **bhp)
342{
343 struct buffer_head *bh;
344 struct nilfs_segment_usage *su;
345 void *kaddr;
346 int ret;
347
348	/* segnum is zero-based */
349 if (segnum >= nilfs_sufile_get_nsegments(sufile))
350 return -EINVAL;
351 down_write(&NILFS_MDT(sufile)->mi_sem);
352 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
353 if (ret < 0)
354 goto out_sem;
355 kaddr = kmap(bh->b_page);
356 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
357 if (nilfs_segment_usage_error(su)) {
358 kunmap(bh->b_page);
359 brelse(bh);
360 ret = -EINVAL;
361 goto out_sem;
362 }
363
364 if (sup != NULL)
365 *sup = su;
366 *bhp = bh;
367
368 out_sem:
369 up_write(&NILFS_MDT(sufile)->mi_sem);
370 return ret;
371}
372
373/**
374 * nilfs_sufile_put_segment_usage - put a segment usage
375 * @sufile: inode of segment usage file
376 * @segnum: segment number
377 * @bh: buffer head
378 *
379 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
380 * specified by @segnum. @bh must be the buffer head that was returned
381 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
382 */
383void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
384 struct buffer_head *bh)
385{
386 kunmap(bh->b_page);
387 brelse(bh);
388}
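nilfs_sufile_get_segment_usage() and nilfs_sufile_put_segment_usage() above are meant to bracket access to a single entry: the get kmap()s the page and takes a buffer reference, the put releases both. A sketch of the intended pattern (the wrapper function is illustrative):

static int example_inspect_usage(struct inode *sufile, __u64 segnum)
{
	struct nilfs_segment_usage *su;
	struct buffer_head *bh;
	int err;

	err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
	if (err)
		return err;

	/* *su stays valid while the page is mapped and bh is held */
	if (nilfs_segment_usage_dirty(su))
		printk(KERN_DEBUG "segment %llu is dirty\n",
		       (unsigned long long)segnum);

	nilfs_sufile_put_segment_usage(sufile, segnum, bh);
	return 0;
}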
389
390/**
391 * nilfs_sufile_get_stat - get segment usage statistics
392 * @sufile: inode of segment usage file
393 * @stat: pointer to a structure of segment usage statistics
394 *
395 * Description: nilfs_sufile_get_stat() returns information about segment
396 * usage.
397 *
398 * Return Value: On success, 0 is returned, and segment usage information is
399 * stored in the place pointed by @stat. On error, one of the following
400 * negative error codes is returned.
401 *
402 * %-EIO - I/O error.
403 *
404 * %-ENOMEM - Insufficient amount of memory available.
405 */
406int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
407{
408 struct buffer_head *header_bh;
409 struct nilfs_sufile_header *header;
410 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
411 void *kaddr;
412 int ret;
413
414 down_read(&NILFS_MDT(sufile)->mi_sem);
415
416 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
417 if (ret < 0)
418 goto out_sem;
419
420 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
421 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
422 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
423 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
424 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
425 sustat->ss_ctime = nilfs->ns_ctime;
426 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
427 spin_lock(&nilfs->ns_last_segment_lock);
428 sustat->ss_prot_seq = nilfs->ns_prot_seq;
429 spin_unlock(&nilfs->ns_last_segment_lock);
430 kunmap_atomic(kaddr, KM_USER0);
431 brelse(header_bh);
432
433 out_sem:
434 up_read(&NILFS_MDT(sufile)->mi_sem);
435 return ret;
436}
437
438/**
439 * nilfs_sufile_get_ncleansegs - get the number of clean segments
440 * @sufile: inode of segment usage file
441 * @nsegsp: pointer to the number of clean segments
442 *
443 * Description: nilfs_sufile_get_ncleansegs() retrieves the number of clean
444 * segments.
445 *
446 * Return Value: On success, 0 is returned and the number of clean segments is
447 * stored in the place pointed by @nsegsp. On error, one of the following
448 * negative error codes is returned.
449 *
450 * %-EIO - I/O error.
451 *
452 * %-ENOMEM - Insufficient amount of memory available.
453 */
454int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
455{
456 struct nilfs_sustat sustat;
457 int ret;
458
459 ret = nilfs_sufile_get_stat(sufile, &sustat);
460 if (ret == 0)
461 *nsegsp = sustat.ss_ncleansegs;
462 return ret;
463}
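A caller needing more than one counter should use nilfs_sufile_get_stat() directly rather than the wrapper above. A sketch (the reporting function is hypothetical):

static void example_report_stat(struct inode *sufile)
{
	struct nilfs_sustat sustat;

	if (nilfs_sufile_get_stat(sufile, &sustat) == 0)
		printk(KERN_DEBUG "segments: %llu total, %llu clean, %llu dirty\n",
		       (unsigned long long)sustat.ss_nsegs,
		       (unsigned long long)sustat.ss_ncleansegs,
		       (unsigned long long)sustat.ss_ndirtysegs);
}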
464
465void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
466 struct buffer_head *header_bh,
467 struct buffer_head *su_bh)
468{
469 struct nilfs_segment_usage *su;
470 void *kaddr;
471 int suclean;
472
473 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
474 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
475 if (nilfs_segment_usage_error(su)) {
476 kunmap_atomic(kaddr, KM_USER0);
477 return;
478 }
479 suclean = nilfs_segment_usage_clean(su);
480 nilfs_segment_usage_set_error(su);
481 kunmap_atomic(kaddr, KM_USER0);
482
483 if (suclean)
484 nilfs_sufile_mod_counter(header_bh, -1, 0);
485 nilfs_mdt_mark_buffer_dirty(su_bh);
486 nilfs_mdt_mark_dirty(sufile);
487}
488
489/**
490 * nilfs_sufile_get_suinfo - get segment usage information
491 * @sufile: inode of segment usage file
492 * @segnum: segment number to start looking
493 * @si: array of suinfo
494 * @nsi: size of suinfo array
495 *
496 * Description: nilfs_sufile_get_suinfo() reads usage information of up to
497 * @nsi segments starting at @segnum and stores it in the @si array.
498 * Return Value: The number of entries stored in @si on success. On error,
499 * one of the following negative error codes is returned.
500 *
501 * %-EIO - I/O error.
502 *
503 * %-ENOMEM - Insufficient amount of memory available.
504 */
505ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
506 struct nilfs_suinfo *si, size_t nsi)
507{
508 struct buffer_head *su_bh;
509 struct nilfs_segment_usage *su;
510 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
511 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
512 void *kaddr;
513 unsigned long nsegs, segusages_per_block;
514 ssize_t n;
515 int ret, i, j;
516
517 down_read(&NILFS_MDT(sufile)->mi_sem);
518
519 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
520 nsegs = min_t(unsigned long,
521 nilfs_sufile_get_nsegments(sufile) - segnum,
522 nsi);
523 for (i = 0; i < nsegs; i += n, segnum += n) {
524 n = min_t(unsigned long,
525 segusages_per_block -
526 nilfs_sufile_get_offset(sufile, segnum),
527 nsegs - i);
528 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
529 &su_bh);
530 if (ret < 0) {
531 if (ret != -ENOENT)
532 goto out;
533 /* hole */
534 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
535 continue;
536 }
537
538 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
539 su = nilfs_sufile_block_get_segment_usage(
540 sufile, segnum, su_bh, kaddr);
541 for (j = 0; j < n; j++, su = (void *)su + susz) {
542 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
546 if (nilfs_segment_is_active(nilfs, segnum + j))
547 si[i + j].sui_flags |=
548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
549 }
550 kunmap_atomic(kaddr, KM_USER0);
551 brelse(su_bh);
552 }
553 ret = nsegs;
554
555 out:
556 up_read(&NILFS_MDT(sufile)->mi_sem);
557 return ret;
558}
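Because the function returns the number of entries filled, it is naturally driven in a loop when walking the whole usage table. A sketch, with an arbitrary batch size:

static void example_dump_suinfo(struct inode *sufile)
{
	struct nilfs_suinfo si[8];	/* small batch; size is arbitrary */
	__u64 segnum = 0;
	ssize_t n, i;

	while ((n = nilfs_sufile_get_suinfo(sufile, segnum, si,
					    ARRAY_SIZE(si))) > 0) {
		for (i = 0; i < n; i++)
			printk(KERN_DEBUG "seg %llu: %u blocks, flags 0x%x\n",
			       (unsigned long long)(segnum + i),
			       si[i].sui_nblocks, si[i].sui_flags);
		segnum += n;	/* returns 0 once the table is exhausted */
	}
}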
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..a2e2efd4ade1
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,125 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_get_segment_usage(struct inode *, __u64,
40 struct nilfs_segment_usage **,
41 struct buffer_head **);
42void nilfs_sufile_put_segment_usage(struct inode *, __u64,
43 struct buffer_head *);
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
47 size_t);
48
49int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *);
61
62/**
63 * nilfs_sufile_cancel_free - cancel freeing of a segment
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description: nilfs_sufile_cancel_free() brings the segment specified by
68 * @segnum back from the clean state to the dirty state, cancelling a free.
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - make a segment garbage
84 * @sufile: inode of segment usage file
85 * @segnum: segment number to be freed
86 */
87static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
88{
89 return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
90}
91
92/**
93 * nilfs_sufile_free - free segment
94 * @sufile: inode of segment usage file
95 * @segnum: segment number to be freed
96 */
97static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
98{
99 return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
100}
101
102/**
103 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file
105 * @segnum: segment number
106 *
107 * Description: nilfs_sufile_set_error() marks the segment specified by
108 * @segnum as erroneous. The error segment will never be used again.
109 *
110 * Return Value: On success, 0 is returned. On error, one of the following
111 * negative error codes is returned.
112 *
113 * %-EIO - I/O error.
114 *
115 * %-ENOMEM - Insufficient amount of memory available.
116 *
117 * %-EINVAL - Invalid segment usage number.
118 */
119static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
120{
121 return nilfs_sufile_update(sufile, segnum, 0,
122 nilfs_sufile_do_set_error);
123}
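All four inline wrappers above funnel into nilfs_sufile_update(), which supplies the bounds check, locking, and buffer lookup; only the per-segment transformation differs. A new state change would follow the same shape (purely illustrative; nothing below is part of the patch):

/* Hypothetical dofunc: the signature is what nilfs_sufile_update() expects. */
static void nilfs_sufile_do_example(struct inode *sufile, __u64 segnum,
				    struct buffer_head *header_bh,
				    struct buffer_head *su_bh)
{
	/* map su_bh, edit the nilfs_segment_usage entry, adjust the header
	   counters with nilfs_sufile_mod_counter(), mark buffers dirty */
}

static inline int nilfs_sufile_example(struct inode *sufile, __u64 segnum)
{
	return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_example);
}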
124
125#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..6989b03e97ab
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1326 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_LICENSE("GPL");
67
68static int nilfs_remount(struct super_block *sb, int *flags, char *data);
69static int test_exclusive_mount(struct file_system_type *fs_type,
70 struct block_device *bdev, int flags);
71
72/**
73 * nilfs_error() - report failure condition on a filesystem
74 *
75 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
76 * reporting an error message. It should be called when NILFS detects
77 * inconsistencies or corruption of on-disk metadata. For recoverable
78 * conditions such as a single-shot I/O error, nilfs_warning() or the
79 * plain printk() function should be used instead.
80 *
81 * The segment constructor must not call this function because it can
82 * kill itself.
83 */
84void nilfs_error(struct super_block *sb, const char *function,
85 const char *fmt, ...)
86{
87 struct nilfs_sb_info *sbi = NILFS_SB(sb);
88 va_list args;
89
90 va_start(args, fmt);
91 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
92 vprintk(fmt, args);
93 printk("\n");
94 va_end(args);
95
96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs;
98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS;
105 nilfs->ns_sbp[0]->s_state |=
106 cpu_to_le16(NILFS_ERROR_FS);
107 nilfs_commit_super(sbi, 1);
108 }
109 up_write(&nilfs->ns_sem);
110
111 if (nilfs_test_opt(sbi, ERRORS_RO)) {
112 printk(KERN_CRIT "Remounting filesystem read-only\n");
113 sb->s_flags |= MS_RDONLY;
114 }
115 }
116
117 if (nilfs_test_opt(sbi, ERRORS_PANIC))
118 panic("NILFS (device %s): panic forced after error\n",
119 sb->s_id);
120}
121
122void nilfs_warning(struct super_block *sb, const char *function,
123 const char *fmt, ...)
124{
125 va_list args;
126
127 va_start(args, fmt);
128 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
129 sb->s_id, function);
130 vprintk(fmt, args);
131 printk("\n");
132 va_end(args);
133}
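Both report helpers take the calling function's name explicitly, so call sites pass __func__. A hypothetical call site (the condition and messages are made up):

static void example_report(struct super_block *sb, struct inode *inode,
			   int corrupt)
{
	if (corrupt)
		nilfs_error(sb, __func__, "broken bmap (ino=%lu)",
			    (unsigned long)inode->i_ino);
	else
		nilfs_warning(sb, __func__, "recoverable I/O error (ino=%lu)",
			      (unsigned long)inode->i_ino);
}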
134
135static struct kmem_cache *nilfs_inode_cachep;
136
137struct inode *nilfs_alloc_inode(struct super_block *sb)
138{
139 struct nilfs_inode_info *ii;
140
141 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
142 if (!ii)
143 return NULL;
144 ii->i_bh = NULL;
145 ii->i_state = 0;
146 ii->vfs_inode.i_version = 1;
147 nilfs_btnode_cache_init(&ii->i_btnode_cache);
148 return &ii->vfs_inode;
149}
150
151void nilfs_destroy_inode(struct inode *inode)
152{
153 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
154}
155
156static void init_once(void *obj)
157{
158 struct nilfs_inode_info *ii = obj;
159
160 INIT_LIST_HEAD(&ii->i_dirty);
161#ifdef CONFIG_NILFS_XATTR
162 init_rwsem(&ii->xattr_sem);
163#endif
164 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
165 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
166 inode_init_once(&ii->vfs_inode);
167}
168
169static int nilfs_init_inode_cache(void)
170{
171 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
172 sizeof(struct nilfs_inode_info),
173 0, SLAB_RECLAIM_ACCOUNT,
174 init_once);
175
176 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
177}
178
179static inline void nilfs_destroy_inode_cache(void)
180{
181 kmem_cache_destroy(nilfs_inode_cachep);
182}
183
184static void nilfs_clear_inode(struct inode *inode)
185{
186 struct nilfs_inode_info *ii = NILFS_I(inode);
187
188#ifdef CONFIG_NILFS_POSIX_ACL
189 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
190 posix_acl_release(ii->i_acl);
191 ii->i_acl = NILFS_ACL_NOT_CACHED;
192 }
193 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
194 posix_acl_release(ii->i_default_acl);
195 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
196 }
197#endif
198 /*
199 * Free resources allocated in nilfs_read_inode(), here.
200 */
201 BUG_ON(!list_empty(&ii->i_dirty));
202 brelse(ii->i_bh);
203 ii->i_bh = NULL;
204
205 if (test_bit(NILFS_I_BMAP, &ii->i_state))
206 nilfs_bmap_clear(ii->i_bmap);
207
208 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
209}
210
211static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
212{
213 struct the_nilfs *nilfs = sbi->s_nilfs;
214 int err;
215 int barrier_done = 0;
216
217 if (nilfs_test_opt(sbi, BARRIER)) {
218 set_buffer_ordered(nilfs->ns_sbh[0]);
219 barrier_done = 1;
220 }
221 retry:
222 set_buffer_dirty(nilfs->ns_sbh[0]);
223 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
224 if (err == -EOPNOTSUPP && barrier_done) {
225 nilfs_warning(sbi->s_super, __func__,
226 "barrier-based sync failed. "
227 "disabling barriers\n");
228 nilfs_clear_opt(sbi, BARRIER);
229 barrier_done = 0;
230 clear_buffer_ordered(nilfs->ns_sbh[0]);
231 goto retry;
232 }
233 if (unlikely(err)) {
234 printk(KERN_ERR
235 "NILFS: unable to write superblock (err=%d)\n", err);
236 if (err == -EIO && nilfs->ns_sbh[1]) {
237 nilfs_fall_back_super_block(nilfs);
238 goto retry;
239 }
240 } else {
241 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
242
243 /*
244			 * The latest segment becomes traceable from the
245			 * position written in the superblock.
246 */
247 clear_nilfs_discontinued(nilfs);
248
249 /* update GC protection for recent segments */
250 if (nilfs->ns_sbh[1]) {
251 sbp = NULL;
252 if (dupsb) {
253 set_buffer_dirty(nilfs->ns_sbh[1]);
254 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
255 sbp = nilfs->ns_sbp[1];
256 }
257 }
258 if (sbp) {
259 spin_lock(&nilfs->ns_last_segment_lock);
260 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
261 spin_unlock(&nilfs->ns_last_segment_lock);
262 }
263 }
264
265 return err;
266}
267
268int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
269{
270 struct the_nilfs *nilfs = sbi->s_nilfs;
271 struct nilfs_super_block **sbp = nilfs->ns_sbp;
272 sector_t nfreeblocks;
273 time_t t;
274 int err;
275
276 /* nilfs->sem must be locked by the caller. */
277 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
278 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
279 nilfs_swap_super_block(nilfs);
280 else {
281			printk(KERN_CRIT "NILFS: superblock broken on dev %s\n",
282 sbi->s_super->s_id);
283 return -EIO;
284 }
285 }
286 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
287 if (unlikely(err)) {
288 printk(KERN_ERR "NILFS: failed to count free blocks\n");
289 return err;
290 }
291 spin_lock(&nilfs->ns_last_segment_lock);
292 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
293 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
294 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
295 spin_unlock(&nilfs->ns_last_segment_lock);
296
297 t = get_seconds();
298 nilfs->ns_sbwtime[0] = t;
299 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
300 sbp[0]->s_wtime = cpu_to_le64(t);
301 sbp[0]->s_sum = 0;
302 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
303 (unsigned char *)sbp[0],
304 nilfs->ns_sbsize));
305 if (dupsb && sbp[1]) {
306 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
307 nilfs->ns_sbwtime[1] = t;
308 }
309 sbi->s_super->s_dirt = 0;
310 return nilfs_sync_super(sbi, dupsb);
311}
312
313static void nilfs_put_super(struct super_block *sb)
314{
315 struct nilfs_sb_info *sbi = NILFS_SB(sb);
316 struct the_nilfs *nilfs = sbi->s_nilfs;
317
318 nilfs_detach_segment_constructor(sbi);
319
320 if (!(sb->s_flags & MS_RDONLY)) {
321 down_write(&nilfs->ns_sem);
322 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
323 nilfs_commit_super(sbi, 1);
324 up_write(&nilfs->ns_sem);
325 }
326
327 nilfs_detach_checkpoint(sbi);
328 put_nilfs(sbi->s_nilfs);
329 sbi->s_super = NULL;
330 sb->s_fs_info = NULL;
331 kfree(sbi);
332}
333
334/**
335 * nilfs_write_super - write super block(s) of NILFS
336 * @sb: super_block
337 *
338 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
339 * clears s_dirt. This function is called in the section protected by
340 * lock_super().
341 *
342 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
343 * of the struct the_nilfs. Lock order must be as follows:
344 *
345 * 1. lock_super()
346 * 2. down_write(&nilfs->ns_sem)
347 *
348 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
349 * of the super block (nilfs->ns_sbp[]).
350 *
351 * In most cases, VFS functions call lock_super() before calling these
352 * methods. So we must be careful not to bring on deadlocks when using
353 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
354 *
355 * Note that the order of lock_kernel() and lock_super() depends on
356 * the VFS context. Also note that lock_kernel() calls may nest, and
357 * only the outermost one has an effect.
358 */
359static void nilfs_write_super(struct super_block *sb)
360{
361 struct nilfs_sb_info *sbi = NILFS_SB(sb);
362 struct the_nilfs *nilfs = sbi->s_nilfs;
363
364 down_write(&nilfs->ns_sem);
365 if (!(sb->s_flags & MS_RDONLY)) {
366 struct nilfs_super_block **sbp = nilfs->ns_sbp;
367 u64 t = get_seconds();
368 int dupsb;
369
370 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
371 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
372 up_write(&nilfs->ns_sem);
373 return;
374 }
375 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
376 nilfs_commit_super(sbi, dupsb);
377 }
378 sb->s_dirt = 0;
379 up_write(&nilfs->ns_sem);
380}
381
382static int nilfs_sync_fs(struct super_block *sb, int wait)
383{
384 int err = 0;
385
386 /* This function is called when super block should be written back */
387 if (wait)
388 err = nilfs_construct_segment(sb);
389 return err;
390}
391
392int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
393{
394 struct the_nilfs *nilfs = sbi->s_nilfs;
395 struct nilfs_checkpoint *raw_cp;
396 struct buffer_head *bh_cp;
397 int err;
398
399 down_write(&nilfs->ns_sem);
400 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_sem);
402
403 sbi->s_ifile = nilfs_mdt_new(
404 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
405 if (!sbi->s_ifile)
406 return -ENOMEM;
407
408 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
409 if (unlikely(err))
410 goto failed;
411
412 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
413 &bh_cp);
414 if (unlikely(err)) {
415 if (err == -ENOENT || err == -EINVAL) {
416 printk(KERN_ERR
417 "NILFS: Invalid checkpoint "
418 "(checkpoint number=%llu)\n",
419 (unsigned long long)cno);
420 err = -EINVAL;
421 }
422 goto failed;
423 }
424 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
425 if (unlikely(err))
426 goto failed_bh;
427 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
428 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
429
430 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
431 return 0;
432
433 failed_bh:
434 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
435 failed:
436 nilfs_mdt_destroy(sbi->s_ifile);
437 sbi->s_ifile = NULL;
438
439 down_write(&nilfs->ns_sem);
440 list_del_init(&sbi->s_list);
441 up_write(&nilfs->ns_sem);
442
443 return err;
444}
445
446void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
447{
448 struct the_nilfs *nilfs = sbi->s_nilfs;
449
450 nilfs_mdt_clear(sbi->s_ifile);
451 nilfs_mdt_destroy(sbi->s_ifile);
452 sbi->s_ifile = NULL;
453 down_write(&nilfs->ns_sem);
454 list_del_init(&sbi->s_list);
455 up_write(&nilfs->ns_sem);
456}
457
458static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
459{
460 struct the_nilfs *nilfs = sbi->s_nilfs;
461 int err = 0;
462
463 down_write(&nilfs->ns_sem);
464 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
465 nilfs->ns_mount_state |= NILFS_VALID_FS;
466 err = nilfs_commit_super(sbi, 1);
467 if (likely(!err))
468 printk(KERN_INFO "NILFS: recovery complete.\n");
469 }
470 up_write(&nilfs->ns_sem);
471 return err;
472}
473
474static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
475{
476 struct super_block *sb = dentry->d_sb;
477 struct nilfs_sb_info *sbi = NILFS_SB(sb);
478 struct the_nilfs *nilfs = sbi->s_nilfs;
479 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
480 unsigned long long blocks;
481 unsigned long overhead;
482 unsigned long nrsvblocks;
483 sector_t nfreeblocks;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489	 * The blocks before the first segment and after the last segment
490	 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499	 * When metadata blocks are placed outside the segment structure,
500	 * we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 buf->f_fsid.val[0] = (u32)id;
518 buf->f_fsid.val[1] = (u32)(id >> 32);
519
520 return 0;
521}
522
523static struct super_operations nilfs_sops = {
524 .alloc_inode = nilfs_alloc_inode,
525 .destroy_inode = nilfs_destroy_inode,
526 .dirty_inode = nilfs_dirty_inode,
527 /* .write_inode = nilfs_write_inode, */
528 /* .put_inode = nilfs_put_inode, */
529 /* .drop_inode = nilfs_drop_inode, */
530 .delete_inode = nilfs_delete_inode,
531 .put_super = nilfs_put_super,
532 .write_super = nilfs_write_super,
533 .sync_fs = nilfs_sync_fs,
534 /* .write_super_lockfs */
535 /* .unlockfs */
536 .statfs = nilfs_statfs,
537 .remount_fs = nilfs_remount,
538 .clear_inode = nilfs_clear_inode,
539 /* .umount_begin */
540 /* .show_options */
541};
542
543static struct inode *
544nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
545{
546 struct inode *inode;
547
548 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
549 ino != NILFS_SKETCH_INO)
550 return ERR_PTR(-ESTALE);
551
552 inode = nilfs_iget(sb, ino);
553 if (IS_ERR(inode))
554 return ERR_CAST(inode);
555 if (generation && inode->i_generation != generation) {
556 iput(inode);
557 return ERR_PTR(-ESTALE);
558 }
559
560 return inode;
561}
562
563static struct dentry *
564nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
565 int fh_type)
566{
567 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
568 nilfs_nfs_get_inode);
569}
570
571static struct dentry *
572nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
573 int fh_type)
574{
575 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
576 nilfs_nfs_get_inode);
577}
578
579static struct export_operations nilfs_export_ops = {
580 .fh_to_dentry = nilfs_fh_to_dentry,
581 .fh_to_parent = nilfs_fh_to_parent,
582 .get_parent = nilfs_get_parent,
583};
584
585enum {
586 Opt_err_cont, Opt_err_panic, Opt_err_ro,
587 Opt_barrier, Opt_snapshot, Opt_order,
588 Opt_err,
589};
590
591static match_table_t tokens = {
592 {Opt_err_cont, "errors=continue"},
593 {Opt_err_panic, "errors=panic"},
594 {Opt_err_ro, "errors=remount-ro"},
595 {Opt_barrier, "barrier=%s"},
596 {Opt_snapshot, "cp=%u"},
597 {Opt_order, "order=%s"},
598 {Opt_err, NULL}
599};
600
601static int match_bool(substring_t *s, int *result)
602{
603 int len = s->to - s->from;
604
605 if (strncmp(s->from, "on", len) == 0)
606 *result = 1;
607 else if (strncmp(s->from, "off", len) == 0)
608 *result = 0;
609 else
610 return 1;
611 return 0;
612}
613
614static int parse_options(char *options, struct super_block *sb)
615{
616 struct nilfs_sb_info *sbi = NILFS_SB(sb);
617 char *p;
618 substring_t args[MAX_OPT_ARGS];
619 int option;
620
621 if (!options)
622 return 1;
623
624 while ((p = strsep(&options, ",")) != NULL) {
625 int token;
626 if (!*p)
627 continue;
628
629 token = match_token(p, tokens, args);
630 switch (token) {
631 case Opt_barrier:
632 if (match_bool(&args[0], &option))
633 return 0;
634 if (option)
635 nilfs_set_opt(sbi, BARRIER);
636 else
637 nilfs_clear_opt(sbi, BARRIER);
638 break;
639 case Opt_order:
640 if (strcmp(args[0].from, "relaxed") == 0)
641 /* Ordered data semantics */
642 nilfs_clear_opt(sbi, STRICT_ORDER);
643 else if (strcmp(args[0].from, "strict") == 0)
644 /* Strict in-order semantics */
645 nilfs_set_opt(sbi, STRICT_ORDER);
646 else
647 return 0;
648 break;
649 case Opt_err_panic:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
651 break;
652 case Opt_err_ro:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
654 break;
655 case Opt_err_cont:
656 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
657 break;
658 case Opt_snapshot:
659 if (match_int(&args[0], &option) || option <= 0)
660 return 0;
661 if (!(sb->s_flags & MS_RDONLY))
662 return 0;
663 sbi->s_snapshot_cno = option;
664 nilfs_set_opt(sbi, SNAPSHOT);
665 break;
666 default:
667 printk(KERN_ERR
668 "NILFS: Unrecognized mount option \"%s\"\n", p);
669 return 0;
670 }
671 }
672 return 1;
673}
674
675static inline void
676nilfs_set_default_options(struct nilfs_sb_info *sbi,
677 struct nilfs_super_block *sbp)
678{
679 sbi->s_mount_opt =
680 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
681}
682
683static int nilfs_setup_super(struct nilfs_sb_info *sbi)
684{
685 struct the_nilfs *nilfs = sbi->s_nilfs;
686 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
687 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
688 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
689
690 /* nilfs->sem must be locked by the caller. */
691 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
692 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
693 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
694 printk(KERN_WARNING
695 "NILFS warning: mounting fs with errors\n");
696#if 0
697 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
698 printk(KERN_WARNING
699 "NILFS warning: maximal mount count reached\n");
700#endif
701 }
702 if (!max_mnt_count)
703 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
704
705 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
706 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
707 sbp->s_mtime = cpu_to_le64(get_seconds());
708 return nilfs_commit_super(sbi, 1);
709}
710
711struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
712 u64 pos, int blocksize,
713 struct buffer_head **pbh)
714{
715 unsigned long long sb_index = pos;
716 unsigned long offset;
717
718 offset = do_div(sb_index, blocksize);
719 *pbh = sb_bread(sb, sb_index);
720 if (!*pbh)
721 return NULL;
722 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
723}
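A sketch of a call site; it assumes NILFS_SB_OFFSET_BYTES is the primary super block's byte offset as defined in the on-disk format header, which is not shown in this hunk:

static int example_read_primary_sb(struct super_block *sb)
{
	struct buffer_head *bh;
	struct nilfs_super_block *sbp;

	sbp = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES,
				     sb->s_blocksize, &bh);
	if (!sbp)
		return -EIO;
	if (le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
		printk(KERN_WARNING "not a NILFS super block\n");
	brelse(bh);
	return 0;
}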
724
725int nilfs_store_magic_and_option(struct super_block *sb,
726 struct nilfs_super_block *sbp,
727 char *data)
728{
729 struct nilfs_sb_info *sbi = NILFS_SB(sb);
730
731 sb->s_magic = le16_to_cpu(sbp->s_magic);
732
733 /* FS independent flags */
734#ifdef NILFS_ATIME_DISABLE
735 sb->s_flags |= MS_NOATIME;
736#endif
737
738 nilfs_set_default_options(sbi, sbp);
739
740 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
741 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
742 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
743 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
744
745 return !parse_options(data, sb) ? -EINVAL : 0 ;
746}
747
748/**
749 * nilfs_fill_super() - initialize a super block instance
750 * @sb: super_block
751 * @data: mount options
752 * @silent: silent mode flag
753 * @nilfs: the_nilfs struct
754 *
755 * This function is called exclusively under bd_mount_sem, so the
756 * recovery process is protected from other simultaneous mounts.
757 */
758static int
759nilfs_fill_super(struct super_block *sb, void *data, int silent,
760 struct the_nilfs *nilfs)
761{
762 struct nilfs_sb_info *sbi;
763 struct inode *root;
764 __u64 cno;
765 int err;
766
767 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
768 if (!sbi)
769 return -ENOMEM;
770
771 sb->s_fs_info = sbi;
772
773 get_nilfs(nilfs);
774 sbi->s_nilfs = nilfs;
775 sbi->s_super = sb;
776
777 err = init_nilfs(nilfs, sbi, (char *)data);
778 if (err)
779 goto failed_sbi;
780
781 spin_lock_init(&sbi->s_inode_lock);
782 INIT_LIST_HEAD(&sbi->s_dirty_files);
783 INIT_LIST_HEAD(&sbi->s_list);
784
785 /*
786	 * The following initialization is redundant because the
787	 * nilfs_sb_info structure was already cleared when allocated.
788	 * We keep it to make the intent explicit and to stay ready
789	 * for future changes.
790 */
791 get_random_bytes(&sbi->s_next_generation,
792 sizeof(sbi->s_next_generation));
793 spin_lock_init(&sbi->s_next_gen_lock);
794
795 sb->s_op = &nilfs_sops;
796 sb->s_export_op = &nilfs_export_ops;
797 sb->s_root = NULL;
798 sb->s_time_gran = 1;
799
800 if (!nilfs_loaded(nilfs)) {
801 err = load_nilfs(nilfs, sbi);
802 if (err)
803 goto failed_sbi;
804 }
805 cno = nilfs_last_cno(nilfs);
806
807 if (sb->s_flags & MS_RDONLY) {
808 if (nilfs_test_opt(sbi, SNAPSHOT)) {
809 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
810 sbi->s_snapshot_cno);
811 if (err < 0)
812 goto failed_sbi;
813 if (!err) {
814 printk(KERN_ERR
815 "NILFS: The specified checkpoint is "
816 "not a snapshot "
817 "(checkpoint number=%llu).\n",
818 (unsigned long long)sbi->s_snapshot_cno);
819 err = -EINVAL;
820 goto failed_sbi;
821 }
822 cno = sbi->s_snapshot_cno;
823 } else
824 /* Read-only mount */
825 sbi->s_snapshot_cno = cno;
826 }
827
828 err = nilfs_attach_checkpoint(sbi, cno);
829 if (err) {
830 printk(KERN_ERR "NILFS: error loading a checkpoint"
831 " (checkpoint number=%llu).\n", (unsigned long long)cno);
832 goto failed_sbi;
833 }
834
835 if (!(sb->s_flags & MS_RDONLY)) {
836 err = nilfs_attach_segment_constructor(sbi);
837 if (err)
838 goto failed_checkpoint;
839 }
840
841 root = nilfs_iget(sb, NILFS_ROOT_INO);
842 if (IS_ERR(root)) {
843 printk(KERN_ERR "NILFS: get root inode failed\n");
844 err = PTR_ERR(root);
845 goto failed_segctor;
846 }
847 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
848 iput(root);
849 printk(KERN_ERR "NILFS: corrupt root inode.\n");
850 err = -EINVAL;
851 goto failed_segctor;
852 }
853 sb->s_root = d_alloc_root(root);
854 if (!sb->s_root) {
855 iput(root);
856 printk(KERN_ERR "NILFS: get root dentry failed\n");
857 err = -ENOMEM;
858 goto failed_segctor;
859 }
860
861 if (!(sb->s_flags & MS_RDONLY)) {
862 down_write(&nilfs->ns_sem);
863 nilfs_setup_super(sbi);
864 up_write(&nilfs->ns_sem);
865 }
866
867 err = nilfs_mark_recovery_complete(sbi);
868 if (unlikely(err)) {
869 printk(KERN_ERR "NILFS: recovery failed.\n");
870 goto failed_root;
871 }
872
873 return 0;
874
875 failed_root:
876 dput(sb->s_root);
877 sb->s_root = NULL;
878
879 failed_segctor:
880 nilfs_detach_segment_constructor(sbi);
881
882 failed_checkpoint:
883 nilfs_detach_checkpoint(sbi);
884
885 failed_sbi:
886 put_nilfs(nilfs);
887 sb->s_fs_info = NULL;
888 kfree(sbi);
889 return err;
890}
891
892static int nilfs_remount(struct super_block *sb, int *flags, char *data)
893{
894 struct nilfs_sb_info *sbi = NILFS_SB(sb);
895 struct nilfs_super_block *sbp;
896 struct the_nilfs *nilfs = sbi->s_nilfs;
897 unsigned long old_sb_flags;
898 struct nilfs_mount_options old_opts;
899 int err;
900
901 old_sb_flags = sb->s_flags;
902 old_opts.mount_opt = sbi->s_mount_opt;
903 old_opts.snapshot_cno = sbi->s_snapshot_cno;
904
905 if (!parse_options(data, sb)) {
906 err = -EINVAL;
907 goto restore_opts;
908 }
909 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
910
911 if ((*flags & MS_RDONLY) &&
912 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
913 printk(KERN_WARNING "NILFS (device %s): couldn't "
914 "remount to a different snapshot. \n",
915 sb->s_id);
916 err = -EINVAL;
917 goto restore_opts;
918 }
919
920 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
921 goto out;
922 if (*flags & MS_RDONLY) {
923 /* Shutting down the segment constructor */
924 nilfs_detach_segment_constructor(sbi);
925 sb->s_flags |= MS_RDONLY;
926
927 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
928 /* nilfs_set_opt(sbi, SNAPSHOT); */
929
930 /*
931 * Remounting a valid RW partition RDONLY, so set
932 * the RDONLY flag and then mark the partition as valid again.
933 */
934 down_write(&nilfs->ns_sem);
935 sbp = nilfs->ns_sbp[0];
936 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
937 (nilfs->ns_mount_state & NILFS_VALID_FS))
938 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
939 sbp->s_mtime = cpu_to_le64(get_seconds());
940 nilfs_commit_super(sbi, 1);
941 up_write(&nilfs->ns_sem);
942 } else {
943 /*
944		 * Remounting a read-only partition read-write, so reread
945		 * and store the current valid flag. (It may have been
946		 * changed by fsck since we originally mounted the partition.)
947 */
948 down(&sb->s_bdev->bd_mount_sem);
949 /* Check existing RW-mount */
950 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
951 printk(KERN_WARNING "NILFS (device %s): couldn't "
952 "remount because a RW-mount exists.\n",
953 sb->s_id);
954 err = -EBUSY;
955 goto rw_remount_failed;
956 }
957 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
958 printk(KERN_WARNING "NILFS (device %s): couldn't "
959 "remount because the current RO-mount is not "
960 "the latest one.\n",
961 sb->s_id);
962 err = -EINVAL;
963 goto rw_remount_failed;
964 }
965 sb->s_flags &= ~MS_RDONLY;
966 nilfs_clear_opt(sbi, SNAPSHOT);
967 sbi->s_snapshot_cno = 0;
968
969 err = nilfs_attach_segment_constructor(sbi);
970 if (err)
971 goto rw_remount_failed;
972
973 down_write(&nilfs->ns_sem);
974 nilfs_setup_super(sbi);
975 up_write(&nilfs->ns_sem);
976
977 up(&sb->s_bdev->bd_mount_sem);
978 }
979 out:
980 return 0;
981
982 rw_remount_failed:
983 up(&sb->s_bdev->bd_mount_sem);
984 restore_opts:
985 sb->s_flags = old_sb_flags;
986 sbi->s_mount_opt = old_opts.mount_opt;
987 sbi->s_snapshot_cno = old_opts.snapshot_cno;
988 return err;
989}
990
991struct nilfs_super_data {
992 struct block_device *bdev;
993 __u64 cno;
994 int flags;
995};
996
997/**
998 * nilfs_identify - pre-read mount options needed to identify mount instance
999 * @data: mount options
1000 * @sd: nilfs_super_data
1001 */
1002static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1003{
1004 char *p, *options = data;
1005 substring_t args[MAX_OPT_ARGS];
1006 int option, token;
1007 int ret = 0;
1008
1009 do {
1010 p = strsep(&options, ",");
1011 if (p != NULL && *p) {
1012 token = match_token(p, tokens, args);
1013 if (token == Opt_snapshot) {
1014 if (!(sd->flags & MS_RDONLY))
1015 ret++;
1016 else {
1017 ret = match_int(&args[0], &option);
1018 if (!ret) {
1019 if (option > 0)
1020 sd->cno = option;
1021 else
1022 ret++;
1023 }
1024 }
1025 }
1026 if (ret)
1027 printk(KERN_ERR
1028 "NILFS: invalid mount option: %s\n", p);
1029 }
1030 if (!options)
1031 break;
1032 BUG_ON(options == data);
1033 *(options - 1) = ',';
1034 } while (!ret);
1035 return ret;
1036}
1037
1038static int nilfs_set_bdev_super(struct super_block *s, void *data)
1039{
1040 struct nilfs_super_data *sd = data;
1041
1042 s->s_bdev = sd->bdev;
1043 s->s_dev = s->s_bdev->bd_dev;
1044 return 0;
1045}
1046
1047static int nilfs_test_bdev_super(struct super_block *s, void *data)
1048{
1049 struct nilfs_super_data *sd = data;
1050
1051 return s->s_bdev == sd->bdev;
1052}
1053
1054static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1055{
1056 struct nilfs_super_data *sd = data;
1057 int ret;
1058
1059 if (s->s_bdev != sd->bdev)
1060 return 0;
1061
1062 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1063 return 1; /* Reuse an old R/W-mode super_block */
1064
1065 if (s->s_flags & sd->flags & MS_RDONLY) {
1066 if (down_read_trylock(&s->s_umount)) {
1067 ret = s->s_root &&
1068 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1069 up_read(&s->s_umount);
1070 /*
1071 * This path is locked with sb_lock by sget().
1072 * So, drop_super() causes deadlock.
1073 */
1074 return ret;
1075 }
1076 }
1077 return 0;
1078}
1079
1080static int
1081nilfs_get_sb(struct file_system_type *fs_type, int flags,
1082 const char *dev_name, void *data, struct vfsmount *mnt)
1083{
1084 struct nilfs_super_data sd;
1085 struct super_block *s, *s2;
1086 struct the_nilfs *nilfs = NULL;
1087 int err, need_to_close = 1;
1088
1089 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1090 if (IS_ERR(sd.bdev))
1091 return PTR_ERR(sd.bdev);
1092
1093 /*
1094	 * To get a mount instance using the sget() VFS routine, NILFS needs
1095	 * much more information than normal filesystems to identify a mount
1096	 * instance. For snapshot mounts, not only a mount type (ro-mount
1097 * or rw-mount) but also a checkpoint number is required.
1098 * The results are passed in sget() using nilfs_super_data.
1099 */
1100 sd.cno = 0;
1101 sd.flags = flags;
1102 if (nilfs_identify((char *)data, &sd)) {
1103 err = -EINVAL;
1104 goto failed;
1105 }
1106
1107 /*
1108 * once the super is inserted into the list by sget, s_umount
1109 * will protect the lockfs code from trying to start a snapshot
1110 * while we are mounting
1111 */
1112 down(&sd.bdev->bd_mount_sem);
1113 if (!sd.cno &&
1114 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1115 err = (err < 0) ? : -EBUSY;
1116 goto failed_unlock;
1117 }
1118
1119 /*
1120 * Phase-1: search any existent instance and get the_nilfs
1121 */
1122 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1123 if (IS_ERR(s))
1124 goto error_s;
1125
1126 if (!s->s_root) {
1127 err = -ENOMEM;
1128 nilfs = alloc_nilfs(sd.bdev);
1129 if (!nilfs)
1130 goto cancel_new;
1131 } else {
1132 struct nilfs_sb_info *sbi = NILFS_SB(s);
1133
1134 /*
1135		 * s_umount protects the super_block from the unmount process;
1136		 * it covers the nilfs_sb_info and the_nilfs pointers.
1137 */
1138 nilfs = sbi->s_nilfs;
1139 get_nilfs(nilfs);
1140 up_write(&s->s_umount);
1141
1142 /*
1143 * Phase-2: search specified snapshot or R/W mode super_block
1144 */
1145 if (!sd.cno)
1146 /* trying to get the latest checkpoint. */
1147 sd.cno = nilfs_last_cno(nilfs);
1148
1149 s2 = sget(fs_type, nilfs_test_bdev_super2,
1150 nilfs_set_bdev_super, &sd);
1151 deactivate_super(s);
1152 /*
1153		 * deactivate_super() invokes close_bdev_exclusive() only via
1154		 * kill_block_super(). Here, s is an existing mount, so we
1155		 * still need one more close_bdev_exclusive() call.
1156 */
1157 s = s2;
1158 if (IS_ERR(s))
1159 goto error_s;
1160 }
1161
1162 if (!s->s_root) {
1163 char b[BDEVNAME_SIZE];
1164
1165 s->s_flags = flags;
1166 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1167 sb_set_blocksize(s, block_size(sd.bdev));
1168
1169 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1170 if (err)
1171 goto cancel_new;
1172
1173 s->s_flags |= MS_ACTIVE;
1174 need_to_close = 0;
1175 } else if (!(s->s_flags & MS_RDONLY)) {
1176 err = -EBUSY;
1177 }
1178
1179 up(&sd.bdev->bd_mount_sem);
1180 put_nilfs(nilfs);
1181 if (need_to_close)
1182 close_bdev_exclusive(sd.bdev, flags);
1183 simple_set_mnt(mnt, s);
1184 return 0;
1185
1186 error_s:
1187 up(&sd.bdev->bd_mount_sem);
1188 if (nilfs)
1189 put_nilfs(nilfs);
1190 close_bdev_exclusive(sd.bdev, flags);
1191 return PTR_ERR(s);
1192
1193 failed_unlock:
1194 up(&sd.bdev->bd_mount_sem);
1195 failed:
1196 close_bdev_exclusive(sd.bdev, flags);
1197
1198 return err;
1199
1200 cancel_new:
1201 /* Abandoning the newly allocated superblock */
1202 up(&sd.bdev->bd_mount_sem);
1203 if (nilfs)
1204 put_nilfs(nilfs);
1205 up_write(&s->s_umount);
1206 deactivate_super(s);
1207 /*
1208 * deactivate_super() invokes close_bdev_exclusive().
1209 * We must finish all post-cleaning before this call;
1210 * put_nilfs() and unlocking bd_mount_sem need the block device.
1211 */
1212 return err;
1213}
1214
1215static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1216{
1217 struct nilfs_super_data *sd = data;
1218 int ret;
1219
1220 if (s->s_bdev != sd->bdev)
1221 return 0;
1222 if (down_read_trylock(&s->s_umount)) {
1223 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1224 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1225 up_read(&s->s_umount);
1226 if (ret)
1227 return 0; /* ignore snapshot mounts */
1228 }
1229 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1230}
1231
1232static int __false_bdev_super(struct super_block *s, void *data)
1233{
1234#if 0 /* XXX: workaround for lock debug. This is not a good idea */
1235 up_write(&s->s_umount);
1236#endif
1237 return -EFAULT;
1238}
1239
1240/**
1241 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1242 * @fs_type: filesystem type
1243 * @bdev: block device
1244 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1245 *
1246 * This function must be called within a section protected by bd_mount_sem.
1248 */
1249static int test_exclusive_mount(struct file_system_type *fs_type,
1250 struct block_device *bdev, int flags)
1251{
1252 struct super_block *s;
1253 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1254
1255 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1256 if (IS_ERR(s)) {
1257 if (PTR_ERR(s) != -EFAULT)
1258 return PTR_ERR(s);
1259 return 0; /* Not found */
1260 }
1261 up_write(&s->s_umount);
1262 deactivate_super(s);
1263 return 1; /* Found */
1264}
1265
1266struct file_system_type nilfs_fs_type = {
1267 .owner = THIS_MODULE,
1268 .name = "nilfs2",
1269 .get_sb = nilfs_get_sb,
1270 .kill_sb = kill_block_super,
1271 .fs_flags = FS_REQUIRES_DEV,
1272};
1273
1274static int __init init_nilfs_fs(void)
1275{
1276 int err;
1277
1278 err = nilfs_init_inode_cache();
1279 if (err)
1280 goto failed;
1281
1282 err = nilfs_init_transaction_cache();
1283 if (err)
1284 goto failed_inode_cache;
1285
1286 err = nilfs_init_segbuf_cache();
1287 if (err)
1288 goto failed_transaction_cache;
1289
1290 err = nilfs_btree_path_cache_init();
1291 if (err)
1292 goto failed_segbuf_cache;
1293
1294 err = register_filesystem(&nilfs_fs_type);
1295 if (err)
1296 goto failed_btree_path_cache;
1297
1298 return 0;
1299
1300 failed_btree_path_cache:
1301 nilfs_btree_path_cache_destroy();
1302
1303 failed_segbuf_cache:
1304 nilfs_destroy_segbuf_cache();
1305
1306 failed_transaction_cache:
1307 nilfs_destroy_transaction_cache();
1308
1309 failed_inode_cache:
1310 nilfs_destroy_inode_cache();
1311
1312 failed:
1313 return err;
1314}
1315
1316static void __exit exit_nilfs_fs(void)
1317{
1318 nilfs_destroy_segbuf_cache();
1319 nilfs_destroy_transaction_cache();
1320 nilfs_destroy_inode_cache();
1321 nilfs_btree_path_cache_destroy();
1322 unregister_filesystem(&nilfs_fs_type);
1323}
1324
1325module_init(init_nilfs_fs)
1326module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..7f65b3be4aa9
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,641 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92	 * An increment of ns_count never occurs below because any
93	 * caller of get_nilfs() already holds a reference to the_nilfs,
94	 * so no exclusion control is required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 static struct lock_class_key dat_lock_key;
119 struct buffer_head *bh_sr;
120 struct nilfs_super_root *raw_sr;
121 struct nilfs_super_block **sbp = nilfs->ns_sbp;
122 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
123 unsigned inode_size;
124 int err;
125
126 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
127 if (unlikely(err))
128 return err;
129
130 down_read(&nilfs->ns_sem);
131 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
132 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
133 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
134 up_read(&nilfs->ns_sem);
135
136 inode_size = nilfs->ns_inode_size;
137
138 err = -ENOMEM;
139 nilfs->ns_dat = nilfs_mdt_new(
140 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
141 if (unlikely(!nilfs->ns_dat))
142 goto failed;
143
144 nilfs->ns_gc_dat = nilfs_mdt_new(
145 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
146 if (unlikely(!nilfs->ns_gc_dat))
147 goto failed_dat;
148
149 nilfs->ns_cpfile = nilfs_mdt_new(
150 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
151 if (unlikely(!nilfs->ns_cpfile))
152 goto failed_gc_dat;
153
154 nilfs->ns_sufile = nilfs_mdt_new(
155 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
156 if (unlikely(!nilfs->ns_sufile))
157 goto failed_cpfile;
158
159 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
160 if (unlikely(err))
161 goto failed_sufile;
162
163 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
164 if (unlikely(err))
165 goto failed_sufile;
166
167 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
168 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
169
170 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
171 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
172 sizeof(struct nilfs_cpfile_header));
173 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
174 sizeof(struct nilfs_sufile_header));
175
176 err = nilfs_mdt_read_inode_direct(
177 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
178 if (unlikely(err))
179 goto failed_sufile;
180
181 err = nilfs_mdt_read_inode_direct(
182 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
183 if (unlikely(err))
184 goto failed_sufile;
185
186 err = nilfs_mdt_read_inode_direct(
187 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
188 if (unlikely(err))
189 goto failed_sufile;
190
191 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
192 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
193
194 failed:
195 brelse(bh_sr);
196 return err;
197
198 failed_sufile:
199 nilfs_mdt_destroy(nilfs->ns_sufile);
200
201 failed_cpfile:
202 nilfs_mdt_destroy(nilfs->ns_cpfile);
203
204 failed_gc_dat:
205 nilfs_mdt_destroy(nilfs->ns_gc_dat);
206
207 failed_dat:
208 nilfs_mdt_destroy(nilfs->ns_dat);
209 goto failed;
210}
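/*
 * Editorial note: the success path above intentionally falls through to
 * the "failed" label -- bh_sr must be released whether the load succeeded
 * or not, and err is 0 at that point on success.  The failed_* labels
 * unwind the metadata file allocations in reverse order of creation and
 * then jump back up to release bh_sr as well.
 */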
211
212static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
213{
214 memset(ri, 0, sizeof(*ri));
215 INIT_LIST_HEAD(&ri->ri_used_segments);
216}
217
218static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
219{
220 nilfs_dispose_segment_list(&ri->ri_used_segments);
221}
222
223/**
224 * load_nilfs - load and recover the nilfs
 225 * @nilfs: the_nilfs structure to be loaded
 226 * @sbi: nilfs_sb_info used to recover past segments
227 *
 228 * load_nilfs() searches for and loads the latest super root,
 229 * attaches the last segment, and does recovery if needed.
 230 * The caller must serialize calls to guard against simultaneous mounts.
231 */
232int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
233{
234 struct nilfs_recovery_info ri;
235 unsigned int s_flags = sbi->s_super->s_flags;
236 int really_read_only = bdev_read_only(nilfs->ns_bdev);
237 unsigned valid_fs;
238 int err = 0;
239
240 nilfs_init_recovery_info(&ri);
241
242 down_write(&nilfs->ns_sem);
243 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
244 up_write(&nilfs->ns_sem);
245
246 if (!valid_fs && (s_flags & MS_RDONLY)) {
 247 printk(KERN_INFO "NILFS: recovery "
 248 "required for readonly filesystem.\n");
249 if (really_read_only) {
250 printk(KERN_ERR "NILFS: write access "
251 "unavailable, cannot proceed.\n");
252 err = -EROFS;
253 goto failed;
254 }
255 printk(KERN_INFO "NILFS: write access will "
256 "be enabled during recovery.\n");
257 sbi->s_super->s_flags &= ~MS_RDONLY;
258 }
259
260 err = nilfs_search_super_root(nilfs, sbi, &ri);
261 if (unlikely(err)) {
262 printk(KERN_ERR "NILFS: error searching super root.\n");
263 goto failed;
264 }
265
266 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
267 if (unlikely(err)) {
268 printk(KERN_ERR "NILFS: error loading super root.\n");
269 goto failed;
270 }
271
272 if (!valid_fs) {
273 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
274 if (unlikely(err)) {
275 nilfs_mdt_destroy(nilfs->ns_cpfile);
276 nilfs_mdt_destroy(nilfs->ns_sufile);
277 nilfs_mdt_destroy(nilfs->ns_dat);
278 goto failed;
279 }
280 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
281 sbi->s_super->s_dirt = 1;
282 }
283
284 set_nilfs_loaded(nilfs);
285
286 failed:
287 nilfs_clear_recovery_info(&ri);
288 sbi->s_super->s_flags = s_flags;
289 return err;
290}
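/*
 * Editorial note: s_flags is restored unconditionally on the way out, so
 * the temporary clearing of MS_RDONLY above (done only to let recovery
 * write to an otherwise read-only mount) is undone whether load_nilfs()
 * succeeds or fails.
 */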
291
292static unsigned long long nilfs_max_size(unsigned int blkbits)
293{
294 unsigned int max_bits;
295 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
296
297 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
298 if (max_bits < 64)
299 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
300 return res;
301}
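/*
 * Editorial note: a worked example with an assumed key width.  If
 * NILFS_BMAP_KEY_BIT were 37 and the block size 4 KiB (blkbits = 12),
 * max_bits would be 49 and s_maxbytes would be capped at 2^49 - 1 bytes;
 * once blkbits + NILFS_BMAP_KEY_BIT reaches 64, only the page-cache
 * limit MAX_LFS_FILESIZE applies.
 */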
302
303static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
304 struct nilfs_super_block *sbp)
305{
306 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
307 printk(KERN_ERR "NILFS: revision mismatch "
308 "(superblock rev.=%d.%d, current rev.=%d.%d). "
309 "Please check the version of mkfs.nilfs.\n",
310 le32_to_cpu(sbp->s_rev_level),
311 le16_to_cpu(sbp->s_minor_rev_level),
312 NILFS_CURRENT_REV, NILFS_MINOR_REV);
313 return -EINVAL;
314 }
315 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
316 if (nilfs->ns_sbsize > BLOCK_SIZE)
317 return -EINVAL;
318
319 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
320 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
321
322 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
323 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
 324 printk(KERN_ERR "NILFS: segment too short.\n");
325 return -EINVAL;
326 }
327
328 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
329 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
330 nilfs->ns_r_segments_percentage =
331 le32_to_cpu(sbp->s_r_segments_percentage);
332 nilfs->ns_nrsvsegs =
333 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
334 DIV_ROUND_UP(nilfs->ns_nsegments *
335 nilfs->ns_r_segments_percentage, 100));
336 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
337 return 0;
338}
339
340static int nilfs_valid_sb(struct nilfs_super_block *sbp)
341{
342 static unsigned char sum[4];
343 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
344 size_t bytes;
345 u32 crc;
346
347 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
348 return 0;
349 bytes = le16_to_cpu(sbp->s_bytes);
350 if (bytes > BLOCK_SIZE)
351 return 0;
352 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
353 sumoff);
354 crc = crc32_le(crc, sum, 4);
355 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
356 bytes - sumoff - 4);
357 return crc == le32_to_cpu(sbp->s_sum);
358}
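/*
 * Editorial note: the CRC is computed over the first s_bytes bytes of the
 * superblock with the 4-byte s_sum field treated as zero, which is why the
 * crc32_le() chain above splices the static all-zero sum[] array in at
 * sumoff before comparing the result against the stored s_sum.
 */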
359
360static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
361{
362 return offset < ((le64_to_cpu(sbp->s_nsegments) *
363 le32_to_cpu(sbp->s_blocks_per_segment)) <<
364 (le32_to_cpu(sbp->s_log_block_size) + 10));
365}
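/*
 * Editorial note: s_log_block_size + 10 converts blocks to bytes, since
 * the base block size is 1 KiB (BLOCK_SIZE), so the right-hand side is
 * the total segment-area size in bytes described by @sbp.  An sb2 offset
 * that falls inside that area is "bad": it suggests the superblock
 * describes a device of a different size than the one it was read from.
 */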
366
367static void nilfs_release_super_block(struct the_nilfs *nilfs)
368{
369 int i;
370
371 for (i = 0; i < 2; i++) {
372 if (nilfs->ns_sbp[i]) {
373 brelse(nilfs->ns_sbh[i]);
374 nilfs->ns_sbh[i] = NULL;
375 nilfs->ns_sbp[i] = NULL;
376 }
377 }
378}
379
380void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
381{
382 brelse(nilfs->ns_sbh[0]);
383 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
384 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
385 nilfs->ns_sbh[1] = NULL;
386 nilfs->ns_sbp[1] = NULL;
387}
388
389void nilfs_swap_super_block(struct the_nilfs *nilfs)
390{
391 struct buffer_head *tsbh = nilfs->ns_sbh[0];
392 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
393
394 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
395 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
396 nilfs->ns_sbh[1] = tsbh;
397 nilfs->ns_sbp[1] = tsbp;
398}
399
400static int nilfs_load_super_block(struct the_nilfs *nilfs,
401 struct super_block *sb, int blocksize,
402 struct nilfs_super_block **sbpp)
403{
404 struct nilfs_super_block **sbp = nilfs->ns_sbp;
405 struct buffer_head **sbh = nilfs->ns_sbh;
406 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
407 int valid[2], swp = 0;
408
409 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
410 &sbh[0]);
411 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
412
413 if (!sbp[0]) {
414 if (!sbp[1]) {
415 printk(KERN_ERR "NILFS: unable to read superblock\n");
416 return -EIO;
417 }
418 printk(KERN_WARNING
419 "NILFS warning: unable to read primary superblock\n");
420 } else if (!sbp[1])
421 printk(KERN_WARNING
422 "NILFS warning: unable to read secondary superblock\n");
423
424 valid[0] = nilfs_valid_sb(sbp[0]);
425 valid[1] = nilfs_valid_sb(sbp[1]);
426 swp = valid[1] &&
427 (!valid[0] ||
428 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
429
430 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
431 brelse(sbh[1]);
432 sbh[1] = NULL;
433 sbp[1] = NULL;
434 swp = 0;
435 }
436 if (!valid[swp]) {
437 nilfs_release_super_block(nilfs);
438 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
439 sb->s_id);
440 return -EINVAL;
441 }
442
443 if (swp) {
444 printk(KERN_WARNING "NILFS warning: broken superblock. "
445 "using spare superblock.\n");
446 nilfs_swap_super_block(nilfs);
447 }
448
449 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
450 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
451 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
452 *sbpp = sbp[0];
453 return 0;
454}
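/*
 * Editorial note on the selection policy above: both superblock copies
 * are read, the checksum-valid copy with the newer s_wtime wins, and the
 * secondary copy is dropped entirely when the computed sb2 offset falls
 * inside the segment area described by the chosen copy.  A swap to the
 * spare copy is reported so the user knows the primary was broken.
 */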
455
456/**
457 * init_nilfs - initialize a NILFS instance.
458 * @nilfs: the_nilfs structure
 459 * @sbi: nilfs_sb_info (the super block is taken from sbi->s_super)
 460 * @data: mount options
462 *
463 * init_nilfs() performs common initialization per block device (e.g.
464 * reading the super block, getting disk layout information, initializing
 465 * shared fields in the_nilfs). It takes on part of the jobs
 466 * typically done by a fill_super() routine. This division arises
 467 * because multiple NILFS instances may be simultaneously
 468 * mounted on the same device.
 469 * For multiple mounts on the same device, only the first mount
 470 * performs these tasks.
471 *
472 * Return Value: On success, 0 is returned. On error, a negative error
473 * code is returned.
474 */
475int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
476{
477 struct super_block *sb = sbi->s_super;
478 struct nilfs_super_block *sbp;
479 struct backing_dev_info *bdi;
480 int blocksize;
481 int err;
482
483 down_write(&nilfs->ns_sem);
484 if (nilfs_init(nilfs)) {
485 /* Load values from existing the_nilfs */
486 sbp = nilfs->ns_sbp[0];
487 err = nilfs_store_magic_and_option(sb, sbp, data);
488 if (err)
489 goto out;
490
491 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
492 if (sb->s_blocksize != blocksize &&
493 !sb_set_blocksize(sb, blocksize)) {
494 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
495 blocksize);
496 err = -EINVAL;
497 }
498 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
499 goto out;
500 }
501
502 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
503 if (!blocksize) {
504 printk(KERN_ERR "NILFS: unable to set blocksize\n");
505 err = -EINVAL;
506 goto out;
507 }
508 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
509 if (err)
510 goto out;
511
512 err = nilfs_store_magic_and_option(sb, sbp, data);
513 if (err)
514 goto failed_sbh;
515
516 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
517 if (sb->s_blocksize != blocksize) {
518 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
519
520 if (blocksize < hw_blocksize) {
521 printk(KERN_ERR
522 "NILFS: blocksize %d too small for device "
523 "(sector-size = %d).\n",
524 blocksize, hw_blocksize);
525 err = -EINVAL;
526 goto failed_sbh;
527 }
528 nilfs_release_super_block(nilfs);
529 sb_set_blocksize(sb, blocksize);
530
531 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
532 if (err)
533 goto out;
534 /* not failed_sbh; sbh is released automatically
535 when reloading fails. */
536 }
537 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
538
539 err = nilfs_store_disk_layout(nilfs, sbp);
540 if (err)
541 goto failed_sbh;
542
543 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
544
545 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
546
547 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
548 if (!bdi)
549 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
550 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
551
552 /* Finding last segment */
553 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
554 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
555 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
556
557 nilfs->ns_seg_seq = nilfs->ns_last_seq;
558 nilfs->ns_segnum =
559 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
560 nilfs->ns_cno = nilfs->ns_last_cno + 1;
561 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
 562 printk(KERN_ERR "NILFS: invalid last segment number.\n");
563 err = -EINVAL;
564 goto failed_sbh;
565 }
566 /* Dummy values */
567 nilfs->ns_free_segments_count =
568 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
569
570 /* Initialize gcinode cache */
571 err = nilfs_init_gccache(nilfs);
572 if (err)
573 goto failed_sbh;
574
575 set_nilfs_init(nilfs);
576 err = 0;
577 out:
578 up_write(&nilfs->ns_sem);
579 return err;
580
581 failed_sbh:
582 nilfs_release_super_block(nilfs);
583 goto out;
584}
585
586int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
587{
588 struct inode *dat = nilfs_dat_inode(nilfs);
589 unsigned long ncleansegs;
590 int err;
591
592 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
593 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
594 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
595 if (likely(!err))
596 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
597 return err;
598}
599
600int nilfs_near_disk_full(struct the_nilfs *nilfs)
601{
602 struct inode *sufile = nilfs->ns_sufile;
603 unsigned long ncleansegs, nincsegs;
604 int ret;
605
606 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
607 if (likely(!ret)) {
608 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
609 nilfs->ns_blocks_per_segment + 1;
610 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
611 ret++;
612 }
613 return ret;
614}
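/*
 * Editorial note: a worked example with assumed geometry.  With
 * ns_blocks_per_segment = 2048, ns_nrsvsegs = 8 and 5000 dirty blocks,
 * nincsegs = 5000 / 2048 + 1 = 3 segments of pending writeout, so the
 * function reports "near full" once 11 or fewer clean segments remain.
 */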
615
616int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
617 int snapshot_mount)
618{
619 struct nilfs_sb_info *sbi;
620 int ret = 0;
621
622 down_read(&nilfs->ns_sem);
623 if (cno == 0 || cno > nilfs->ns_cno)
624 goto out_unlock;
625
626 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
627 if (sbi->s_snapshot_cno == cno &&
628 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
629 /* exclude read-only mounts */
630 ret++;
631 break;
632 }
633 }
634 /* for protecting recent checkpoints */
635 if (cno >= nilfs_last_cno(nilfs))
636 ret++;
637
638 out_unlock:
639 up_read(&nilfs->ns_sem);
640 return ret;
641}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has been done and
 38 the latest checkpoint was loaded */
 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain is broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
 * @ns_mount_state: file system state loaded from the super block
 56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
 114 * The following fields are dedicated to a writable FS instance.
 115 * Except while searching for a checkpoint, code outside the segment
 116 * constructor must hold the segment semaphore while accessing these
 117 * fields.
 118 * Only one writable FS instance exists during the lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
 144 * The following fields are lock-free except during the period
 145 * before the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
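/*
 * Editorial note: as an example, THE_NILFS_FNS(INIT, init) above expands
 * to set_nilfs_init(), clear_nilfs_init() and nilfs_init(), which set,
 * clear and test the THE_NILFS_INIT bit in ns_flags respectively.
 */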
188
189/* Minimum interval of periodical update of superblocks (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
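/*
 * Editorial note: ns_writer_refcount starts at -1 (see alloc_nilfs()), so
 * atomic_inc_and_test() above fires only for the first of the concurrent
 * getters, which then takes ns_writer_mutex, and atomic_add_negative()
 * fires only for the last putter, which releases it:
 *
 *	refcount  -1 --get--> 0 (mutex taken) --get--> 1
 *	           1 --put--> 0 --put--> -1 (mutex released)
 */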
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
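/*
 * Editorial note: a worked example with assumed geometry.  With
 * ns_blocks_per_segment = 2048 and ns_first_data_block = 64, segment 3
 * spans blocks 6144..8191, while segment 0 spans blocks 64..2047 because
 * its head is shortened by the area preceding the first data block.
 */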
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
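/*
 * Editorial note: sector_div() is used rather than the '/' operator
 * because sector_t may be a 64-bit type on 32-bit kernels, where plain
 * 64-bit division is unavailable; it divides segnum in place and returns
 * the remainder, which is ignored here.
 */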
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bed766e435b5..1634319e2404 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -220,7 +220,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
 		rem = 0;
 	}
 
-	kevent->name = kmalloc(len + rem, GFP_KERNEL);
+	kevent->name = kmalloc(len + rem, GFP_NOFS);
 	if (unlikely(!kevent->name)) {
 		kmem_cache_free(event_cachep, kevent);
 		return NULL;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd4..5a9e34475e37 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-		const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'), cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0a..82c5085559c6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 			goto em_put_err_out;
 		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
 				le16_to_cpu(al_entry->length));
-		if (le32_to_cpu(al_entry->type) >
-				const_le32_to_cpu(AT_DATA))
+		if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
 			goto em_put_err_out;
 		if (AT_DATA != al_entry->type)
 			continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328eceb..50931b1ce4b9 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
 
 #include "types.h"
 
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x) __constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x) __constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x) __constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x) __constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x) __constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x) __constant_cpu_to_le64(x)
-
 /* The NTFS oem_id "NTFS " */
-#define magicNTFS const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS cpu_to_le64(0x202020205346544eULL)
 
 /*
  * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
  */
 enum {
 	/* Found in $MFT/$DATA. */
-	magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
 
 	/* Found in $LogFile/$DATA. */
-	magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
-	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
 						transfer was detected. */
 	/*
 	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
 	 * thus not initialized. Page must be initialized before using it.
 	 */
-	magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 
 typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
  * information about the mft record in which they are present.
  */
 enum {
-	MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+	MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 
 typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
  * Note: The _LE versions will return a CPU endian formatted value!
  */
 #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
  * a revealing choice of symbol I do not know what is... (-;
  */
 enum {
-	AT_UNUSED = const_cpu_to_le32( 0),
-	AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10),
-	AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20),
-	AT_FILE_NAME = const_cpu_to_le32( 0x30),
-	AT_OBJECT_ID = const_cpu_to_le32( 0x40),
-	AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50),
-	AT_VOLUME_NAME = const_cpu_to_le32( 0x60),
-	AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70),
-	AT_DATA = const_cpu_to_le32( 0x80),
-	AT_INDEX_ROOT = const_cpu_to_le32( 0x90),
-	AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0),
-	AT_BITMAP = const_cpu_to_le32( 0xb0),
-	AT_REPARSE_POINT = const_cpu_to_le32( 0xc0),
-	AT_EA_INFORMATION = const_cpu_to_le32( 0xd0),
-	AT_EA = const_cpu_to_le32( 0xe0),
-	AT_PROPERTY_SET = const_cpu_to_le32( 0xf0),
-	AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000),
-	AT_END = const_cpu_to_le32(0xffffffff)
+	AT_UNUSED = cpu_to_le32( 0),
+	AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
+	AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
+	AT_FILE_NAME = cpu_to_le32( 0x30),
+	AT_OBJECT_ID = cpu_to_le32( 0x40),
+	AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
+	AT_VOLUME_NAME = cpu_to_le32( 0x60),
+	AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
+	AT_DATA = cpu_to_le32( 0x80),
+	AT_INDEX_ROOT = cpu_to_le32( 0x90),
+	AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
+	AT_BITMAP = cpu_to_le32( 0xb0),
+	AT_REPARSE_POINT = cpu_to_le32( 0xc0),
+	AT_EA_INFORMATION = cpu_to_le32( 0xd0),
+	AT_EA = cpu_to_le32( 0xe0),
+	AT_PROPERTY_SET = cpu_to_le32( 0xf0),
+	AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
+	AT_END = cpu_to_le32(0xffffffff)
 };
 
 typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
  * equal then the second le32 values would be compared, etc.
  */
 enum {
-	COLLATION_BINARY = const_cpu_to_le32(0x00),
-	COLLATION_FILE_NAME = const_cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID = const_cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13),
+	COLLATION_BINARY = cpu_to_le32(0x00),
+	COLLATION_FILE_NAME = cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID = cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
 };
 
 typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
  * NT4.
  */
 enum {
-	ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be
+	ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
 			indexed. */
-	ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type
+	ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
 			can be present multiple times in the
 			mft records of an inode. */
-	ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value
+	ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
 			must contain at least one non-zero
 			byte. */
-	ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be
+	ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
 			indexed and the attribute value must be
 			unique for the attribute type in all of
 			the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be
+	ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
 			named and the name must be unique for
 			the attribute type in all of the mft
 			records of an inode. */
-	ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be
+	ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
 			resident. */
-	ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log
+	ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
 			modifications to this attribute,
 			regardless of whether it is resident or
 			non-resident. Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
  * Attribute flags (16-bit).
  */
 enum {
-	ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+	ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
 			mask. Also, first
 			illegal value. */
-	ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE = const_cpu_to_le16(0x8000),
+	ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 
 typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
  * flags appear in all of the above.
  */
 enum {
-	FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
+	FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
 
-	FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010),
+	FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
 	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
 	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080),
+	FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
 
-	FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800),
+	FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
 
-	FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000),
+	FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
 
-	FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7),
+	FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
 	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
 	   FILE_ATTR_DEVICE and preserves everything else. This mask is used
 	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7),
+	FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
 	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
@@ -846,11 +835,11 @@ enum {
 	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
 	 * attribute of an mft record.
 	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000),
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000),
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
 	   index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
 	/* Specific rights for files and directories are as follows: */
 
 	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA = const_cpu_to_le32(0x00000001),
+	FILE_READ_DATA = cpu_to_le32(0x00000001),
 	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001),
+	FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
 
 	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA = const_cpu_to_le32(0x00000002),
+	FILE_WRITE_DATA = cpu_to_le32(0x00000002),
 	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE = const_cpu_to_le32(0x00000002),
+	FILE_ADD_FILE = cpu_to_le32(0x00000002),
 
 	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA = const_cpu_to_le32(0x00000004),
+	FILE_APPEND_DATA = cpu_to_le32(0x00000004),
 	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004),
+	FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
 
 	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA = const_cpu_to_le32(0x00000008),
+	FILE_READ_EA = cpu_to_le32(0x00000008),
 
 	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA = const_cpu_to_le32(0x00000010),
+	FILE_WRITE_EA = cpu_to_le32(0x00000010),
 
 	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE = const_cpu_to_le32(0x00000020),
+	FILE_EXECUTE = cpu_to_le32(0x00000020),
 	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE = const_cpu_to_le32(0x00000020),
+	FILE_TRAVERSE = cpu_to_le32(0x00000020),
 
 	/*
 	 * Right to delete a directory and all the files it contains (its
 	 * children), even if the files are read-only. (DIRECTORY)
 	 */
-	FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040),
+	FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
 
 	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080),
+	FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
 
 	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100),
+	FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
 
 	/*
 	 * The standard rights (bits 16 to 23). These are independent of the
@@ -1489,27 +1478,27 @@ enum {
 	 */
 
 	/* Right to delete the object. */
-	DELETE = const_cpu_to_le32(0x00010000),
+	DELETE = cpu_to_le32(0x00010000),
 
 	/*
 	 * Right to read the information in the object's security descriptor,
 	 * not including the information in the SACL, i.e. right to read the
 	 * security descriptor and owner.
 	 */
-	READ_CONTROL = const_cpu_to_le32(0x00020000),
+	READ_CONTROL = cpu_to_le32(0x00020000),
 
 	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC = const_cpu_to_le32(0x00040000),
+	WRITE_DAC = cpu_to_le32(0x00040000),
 
 	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER = const_cpu_to_le32(0x00080000),
+	WRITE_OWNER = cpu_to_le32(0x00080000),
 
 	/*
 	 * Right to use the object for synchronization. Enables a process to
 	 * wait until the object is in the signalled state. Some object types
 	 * do not support this access right.
 	 */
-	SYNCHRONIZE = const_cpu_to_le32(0x00100000),
+	SYNCHRONIZE = cpu_to_le32(0x00100000),
 
 	/*
 	 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
 	 */
 
 	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
 
 	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000),
+	STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
 
 	/*
 	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
 	 * SYNCHRONIZE access.
 	 */
-	STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000),
+	STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
 
 	/*
 	 * The access system ACL and maximum allowed access types (bits 24 to
 	 * 25, bits 26 to 27 are reserved).
 	 */
-	ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000),
+	ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
 
 	/*
 	 * The generic rights (bits 28 to 31). These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
 	 */
 
 	/* Read, write, and execute access. */
-	GENERIC_ALL = const_cpu_to_le32(0x10000000),
+	GENERIC_ALL = cpu_to_le32(0x10000000),
 
 	/* Execute access. */
-	GENERIC_EXECUTE = const_cpu_to_le32(0x20000000),
+	GENERIC_EXECUTE = cpu_to_le32(0x20000000),
 
 	/*
 	 * Write access. For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
 	 * For directories, the mapping has the same numerical value. See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_WRITE = const_cpu_to_le32(0x40000000),
+	GENERIC_WRITE = cpu_to_le32(0x40000000),
 
 	/*
 	 * Read access. For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
 	 * For directories, the mapping has the same numerical value. See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_READ = const_cpu_to_le32(0x80000000),
+	GENERIC_READ = cpu_to_le32(0x80000000),
 };
 
 typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
  * The object ACE flags (32-bit).
  */
 enum {
-	ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2),
+	ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
 };
 
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
  * expressed as offsets from the beginning of the security descriptor.
  */
 enum {
-	SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002),
-	SE_DACL_PRESENT = const_cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008),
+	SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
+	SE_DACL_PRESENT = cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
 
-	SE_SACL_PRESENT = const_cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020),
+	SE_SACL_PRESENT = cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
 
-	SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800),
+	SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
 
-	SE_DACL_PROTECTED = const_cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED = const_cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE = const_cpu_to_le16(0x8000)
+	SE_DACL_PROTECTED = cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED = cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE = cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
  * Possible flags for the volume (16-bit).
  */
 enum {
-	VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008),
+	VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
 
-	VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020),
+	VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
 
-	VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000),
+	VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f),
+	VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027),
+	VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
  * The user quota flags. Names explain meaning.
  */
 enum {
-	QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004),
+	QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
 
-	QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007),
+	QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
 	/* This is a bit mask for the user quota flags. */
 
 	/*
 	 * These flags are only present in the quota defaults index entry, i.e.
 	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
 	 */
-	QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080),
+	QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
 
-	QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800),
+	QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
 };
 
 typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
2172 * Predefined owner_id values (32-bit). 2161 * Predefined owner_id values (32-bit).
2173 */ 2162 */
2174enum { 2163enum {
2175 QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), 2164 QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
2176 QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), 2165 QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
2177 QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), 2166 QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
2178}; 2167};
2179 2168
2180/* 2169/*
@@ -2189,14 +2178,14 @@ typedef enum {
2189 * Index entry flags (16-bit). 2178 * Index entry flags (16-bit).
2190 */ 2179 */
2191enum { 2180enum {
2192 INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a 2181 INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
2193 sub-node, i.e. a reference to an index block in form of 2182 sub-node, i.e. a reference to an index block in form of
2194 a virtual cluster number (see below). */ 2183 a virtual cluster number (see below). */
2195 INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last 2184 INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
2196 entry in an index block. The index entry does not 2185 entry in an index block. The index entry does not
2197 represent a file but it can point to a sub-node. */ 2186 represent a file but it can point to a sub-node. */
2198 2187
2199 INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force 2188 INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
2200 enum bit width to 16-bit. */ 2189 enum bit width to 16-bit. */
2201} __attribute__ ((__packed__)); 2190} __attribute__ ((__packed__));
2202 2191
@@ -2334,26 +2323,26 @@ typedef struct {
2334 * These are the predefined reparse point tags: 2323 * These are the predefined reparse point tags:
2335 */ 2324 */
2336enum { 2325enum {
2337 IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), 2326 IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
2338 IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), 2327 IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
2339 IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), 2328 IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
2340 2329
2341 IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), 2330 IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
2342 IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), 2331 IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
2343 IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), 2332 IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
2344 2333
2345 IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), 2334 IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
2346 IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), 2335 IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
2347 IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), 2336 IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
2348 IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), 2337 IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
2349 2338
2350 IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), 2339 IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
2351 2340
2352 IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), 2341 IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
2353 2342
2354 IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), 2343 IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
2355 2344
2356 IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), 2345 IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
2357}; 2346};
2358 2347
2359/* 2348/*
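
The fs/ntfs hunks above are mechanical: the driver's private const_cpu_to_le16()/const_cpu_to_le32() wrappers are dropped because the generic cpu_to_le16()/cpu_to_le32() are safe in constant expressions such as the enum initializers here - the underlying byteswap helpers dispatch on __builtin_constant_p(). A simplified user-space rendition of that dispatch (the real macros live in the kernel's swab headers and differ in detail):

    #include <stdio.h>

    static unsigned short fswab16(unsigned short x)     /* runtime fallback */
    {
            return (unsigned short)((x << 8) | (x >> 8));
    }

    #define constant_swab16(x) ((unsigned short)(               \
            (((unsigned short)(x) & 0x00ffU) << 8) |            \
            (((unsigned short)(x) & 0xff00U) >> 8)))

    /* Constant-folds in constant expressions, calls the helper otherwise. */
    #define swab16(x)                                           \
            (__builtin_constant_p((unsigned short)(x)) ?        \
             constant_swab16(x) :                               \
             fswab16(x))

    /* Usable as an enum initializer - the whole point of the cleanup: */
    enum { DEMO_FLAG = swab16(0x0800) };

    int main(void)
    {
            printf("0x%04x\n", DEMO_FLAG);  /* prints 0x0008 */
            return 0;
    }

On little-endian hosts cpu_to_le16() is an identity cast; on big-endian it expands to a swap like the one above, and the constant branch is what lets it appear in these enums.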
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae3..b5a6f08bd35c 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
104 * in this particular client array. Also inside the client records themselves, 104 * in this particular client array. Also inside the client records themselves,
105 * this means that there are no client records preceding or following this one. 105 * this means that there are no client records preceding or following this one.
106 */ 106 */
107#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) 107#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
108#define LOGFILE_NO_CLIENT_CPU 0xffff 108#define LOGFILE_NO_CLIENT_CPU 0xffff
109 109
110/* 110/*
@@ -112,8 +112,8 @@ typedef struct {
112 * information about the log file in which they are present. 112 * information about the log file in which they are present.
113 */ 113 */
114enum { 114enum {
115 RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), 115 RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
116 RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ 116 RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
117} __attribute__ ((__packed__)); 117} __attribute__ ((__packed__));
118 118
119typedef le16 RESTART_AREA_FLAGS; 119typedef le16 RESTART_AREA_FLAGS;
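
A side note on the __attribute__ ((__packed__)) and the 0xffff space-filler entries that keep reappearing in these enums (RESTART_SPACE_FILLER above, INDEX_ENTRY_SPACE_FILLER earlier): together they pin the enum's storage to exactly 16 bits, so the typedef'd flag fields overlay the on-disk layout. A stand-alone illustration (not kernel code):

    #include <stdio.h>

    enum demo_flags {
            DEMO_VOLUME_IS_CLEAN = 0x0002,
            DEMO_SPACE_FILLER    = 0xffff,  /* forces a 16-bit underlying type */
    } __attribute__ ((__packed__));

    int main(void)
    {
            /* prints 2 with gcc; without the filler, packed could shrink it to 1 */
            printf("sizeof(enum demo_flags) = %zu\n", sizeof(enum demo_flags));
            return 0;
    }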
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc35..23bf68453d7d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2839 */ 2839 */
2840 2840
2841 /* Mark the mft record as not in use. */ 2841 /* Mark the mft record as not in use. */
2842 m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); 2842 m->flags &= ~MFT_RECORD_IN_USE;
2843 2843
2844 /* Increment the sequence number, skipping zero, if it is not zero. */ 2844 /* Increment the sequence number, skipping zero, if it is not zero. */
2845 old_seq_no = m->sequence_number; 2845 old_seq_no = m->sequence_number;
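
The mft.c change is more than tidying of the const_ helpers: it relies on bitwise complement commuting with byte swapping, i.e. ~swab16(x) == swab16(~x), so a little-endian flags word can be masked with ~MFT_RECORD_IN_USE directly, with no round trip through CPU byte order. A quick user-space check of the identity:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t swab16(uint16_t x)
    {
            return (uint16_t)((x << 8) | (x >> 8));
    }

    int main(void)
    {
            uint16_t flag = 0x0001;                /* MFT_RECORD_IN_USE, CPU order */
            uint16_t a = (uint16_t)~swab16(flag);  /* complement the le16 value */
            uint16_t b = swab16((uint16_t)~flag);  /* byte-swap the complement */

            printf("0x%04x 0x%04x %s\n", a, b, a == b ? "equal" : "differ");
            return 0;
    }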
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b5077..f76951dcd4a6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
618 * many BIOSes will refuse to boot from a bootsector if the magic is 618 * many BIOSes will refuse to boot from a bootsector if the magic is
619 * incorrect, so we emit a warning. 619 * incorrect, so we emit a warning.
620 */ 620 */
621 if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) 621 if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
622 ntfs_warning(sb, "Invalid end of sector marker."); 622 ntfs_warning(sb, "Invalid end of sector marker.");
623 return true; 623 return true;
624not_ntfs: 624not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1242 u32 *kaddr, *kend; 1242 u32 *kaddr, *kend;
1243 ntfs_name *name = NULL; 1243 ntfs_name *name = NULL;
1244 int ret = 1; 1244 int ret = 1;
1245 static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), 1245 static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
1246 const_cpu_to_le16('i'), const_cpu_to_le16('b'), 1246 cpu_to_le16('i'), cpu_to_le16('b'),
1247 const_cpu_to_le16('e'), const_cpu_to_le16('r'), 1247 cpu_to_le16('e'), cpu_to_le16('r'),
1248 const_cpu_to_le16('f'), const_cpu_to_le16('i'), 1248 cpu_to_le16('f'), cpu_to_le16('i'),
1249 const_cpu_to_le16('l'), const_cpu_to_le16('.'), 1249 cpu_to_le16('l'), cpu_to_le16('.'),
1250 const_cpu_to_le16('s'), const_cpu_to_le16('y'), 1250 cpu_to_le16('s'), cpu_to_le16('y'),
1251 const_cpu_to_le16('s'), 0 }; 1251 cpu_to_le16('s'), 0 };
1252 1252
1253 ntfs_debug("Entering."); 1253 ntfs_debug("Entering.");
1254 /* 1254 /*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1296 goto iput_out; 1296 goto iput_out;
1297 } 1297 }
1298 kaddr = (u32*)page_address(page); 1298 kaddr = (u32*)page_address(page);
1299 if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { 1299 if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " 1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
1301 "hibernated on the volume. This is the " 1301 "hibernated on the volume. This is the "
1302 "system volume."); 1302 "system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
1337 MFT_REF mref; 1337 MFT_REF mref;
1338 struct inode *tmp_ino; 1338 struct inode *tmp_ino;
1339 ntfs_name *name = NULL; 1339 ntfs_name *name = NULL;
1340 static const ntfschar Quota[7] = { const_cpu_to_le16('$'), 1340 static const ntfschar Quota[7] = { cpu_to_le16('$'),
1341 const_cpu_to_le16('Q'), const_cpu_to_le16('u'), 1341 cpu_to_le16('Q'), cpu_to_le16('u'),
1342 const_cpu_to_le16('o'), const_cpu_to_le16('t'), 1342 cpu_to_le16('o'), cpu_to_le16('t'),
1343 const_cpu_to_le16('a'), 0 }; 1343 cpu_to_le16('a'), 0 };
1344 static ntfschar Q[3] = { const_cpu_to_le16('$'), 1344 static ntfschar Q[3] = { cpu_to_le16('$'),
1345 const_cpu_to_le16('Q'), 0 }; 1345 cpu_to_le16('Q'), 0 };
1346 1346
1347 ntfs_debug("Entering."); 1347 ntfs_debug("Entering.");
1348 /* 1348 /*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
1416 struct page *page; 1416 struct page *page;
1417 ntfs_name *name = NULL; 1417 ntfs_name *name = NULL;
1418 USN_HEADER *uh; 1418 USN_HEADER *uh;
1419 static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), 1419 static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
1420 const_cpu_to_le16('U'), const_cpu_to_le16('s'), 1420 cpu_to_le16('U'), cpu_to_le16('s'),
1421 const_cpu_to_le16('n'), const_cpu_to_le16('J'), 1421 cpu_to_le16('n'), cpu_to_le16('J'),
1422 const_cpu_to_le16('r'), const_cpu_to_le16('n'), 1422 cpu_to_le16('r'), cpu_to_le16('n'),
1423 const_cpu_to_le16('l'), 0 }; 1423 cpu_to_le16('l'), 0 };
1424 static ntfschar Max[5] = { const_cpu_to_le16('$'), 1424 static ntfschar Max[5] = { cpu_to_le16('$'),
1425 const_cpu_to_le16('M'), const_cpu_to_le16('a'), 1425 cpu_to_le16('M'), cpu_to_le16('a'),
1426 const_cpu_to_le16('x'), 0 }; 1426 cpu_to_le16('x'), 0 };
1427 static ntfschar J[3] = { const_cpu_to_le16('$'), 1427 static ntfschar J[3] = { cpu_to_le16('$'),
1428 const_cpu_to_le16('J'), 0 }; 1428 cpu_to_le16('J'), 0 };
1429 1429
1430 ntfs_debug("Entering."); 1430 ntfs_debug("Entering.");
1431 /* 1431 /*
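
The verbose character-by-character constants above (hiberfil, $Quota, $UsnJrnl, ...) exist because ntfschar is le16: NTFS stores names as UTF-16LE, and C has no portable UTF-16LE string literal, so each code unit gets its own cpu_to_le16(). A small demonstration that the per-character form produces the on-disk byte sequence (to_le16() here is a user-space stand-in for the kernel macro):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static uint16_t to_le16(uint16_t x)
    {
            uint8_t b[2] = { (uint8_t)x, (uint8_t)(x >> 8) };
            uint16_t v;

            memcpy(&v, b, sizeof(v));   /* little-endian on any host */
            return v;
    }

    int main(void)
    {
            uint16_t name[3] = { to_le16('$'), to_le16('Q'), 0 };
            const uint8_t ondisk[4] = { '$', 0, 'Q', 0 };

            printf("%s\n", memcmp(name, ondisk, sizeof(ondisk)) ? "mismatch"
                                                                : "match");
            return 0;
    }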
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac327..00d8e6bd7c36 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
116 * documentation: http://www.linux-ntfs.org/ 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
120 USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), 120 USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
121 USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), 121 USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
122 USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), 122 USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
123 USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), 123 USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
124 USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), 124 USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
125 USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), 125 USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
126 USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), 126 USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
127 USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), 127 USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
128 USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), 128 USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
129 USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), 129 USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
130 USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), 130 USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
131 USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), 131 USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
132 USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), 132 USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
133 USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), 133 USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
134 USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), 134 USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
135 USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), 135 USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
136 USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), 136 USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
137 USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), 137 USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
138 USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), 138 USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
139 USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), 139 USN_REASON_CLOSE = cpu_to_le32(0x80000000),
140}; 140};
141 141
142typedef le32 USN_REASON_FLAGS; 142typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
148 * http://www.linux-ntfs.org/ 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
152 USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), 152 USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
153 USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), 153 USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
154}; 154};
155 155
156typedef le32 USN_SOURCE_INFO_FLAGS; 156typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
296 return PTR_ERR(acl); 296 return PTR_ERR(acl);
297 } 297 }
298 if (!acl) 298 if (!acl)
299 inode->i_mode &= ~current->fs->umask; 299 inode->i_mode &= ~current_umask();
300 } 300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 302 struct posix_acl *clone;
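
The acl.c hunk is part of a tree-wide cleanup hiding current->fs->umask behind a current_umask() helper (presumably a trivial accessor). The rule being implemented is the usual POSIX one: when the parent directory carries no default ACL, the process umask masks the requested mode; when a default ACL exists, the ACL takes over that job. In user-space miniature (values made up):

    #include <stdio.h>

    int main(void)
    {
            unsigned int mode = 0666;       /* mode requested at create time */
            unsigned int umask_bits = 022;  /* hypothetical process umask */

            mode &= ~umask_bits;            /* what ~current_umask() does */
            printf("resulting mode: %o\n", mode);   /* prints 644 */
            return 0;
    }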
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295}; 295};
296 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
297static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298 struct inode *inode, 347 struct inode *inode,
299 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
340} 389}
341 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
342static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
344{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1956} 1956}
1957 1957
1958const struct address_space_operations ocfs2_aops = { 1958const struct address_space_operations ocfs2_aops = {
1959 .readpage = ocfs2_readpage, 1959 .readpage = ocfs2_readpage,
1960 .readpages = ocfs2_readpages, 1960 .readpages = ocfs2_readpages,
1961 .writepage = ocfs2_writepage, 1961 .writepage = ocfs2_writepage,
1962 .write_begin = ocfs2_write_begin, 1962 .write_begin = ocfs2_write_begin,
1963 .write_end = ocfs2_write_end, 1963 .write_end = ocfs2_write_end,
1964 .bmap = ocfs2_bmap, 1964 .bmap = ocfs2_bmap,
1965 .sync_page = block_sync_page, 1965 .sync_page = block_sync_page,
1966 .direct_IO = ocfs2_direct_IO, 1966 .direct_IO = ocfs2_direct_IO,
1967 .invalidatepage = ocfs2_invalidatepage, 1967 .invalidatepage = ocfs2_invalidatepage,
1968 .releasepage = ocfs2_releasepage, 1968 .releasepage = ocfs2_releasepage,
1969 .migratepage = buffer_migrate_page, 1969 .migratepage = buffer_migrate_page,
1970 .is_partially_uptodate = block_is_partially_uptodate,
1970}; 1971};
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
33#include <linux/random.h> 33#include <linux/random.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h>
36 37
37#include "heartbeat.h" 38#include "heartbeat.h"
38#include "tcp.h" 39#include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 61static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 62static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62 63
64#define O2HB_DEBUG_DIR "o2hb"
65#define O2HB_DEBUG_LIVENODES "livenodes"
66static struct dentry *o2hb_debug_dir;
67static struct dentry *o2hb_debug_livenodes;
68
63static LIST_HEAD(o2hb_all_regions); 69static LIST_HEAD(o2hb_all_regions);
64 70
65static struct o2hb_callback { 71static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
905 return 0; 911 return 0;
906} 912}
907 913
908void o2hb_init(void) 914#ifdef CONFIG_DEBUG_FS
915static int o2hb_debug_open(struct inode *inode, struct file *file)
916{
917 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
918 char *buf = NULL;
919 int i = -1;
920 int out = 0;
921
922 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
923 if (!buf)
924 goto bail;
925
926 o2hb_fill_node_map(map, sizeof(map));
927
928 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
929 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
930 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
931
932 i_size_write(inode, out);
933
934 file->private_data = buf;
935
936 return 0;
937bail:
938 return -ENOMEM;
939}
940
941static int o2hb_debug_release(struct inode *inode, struct file *file)
942{
943 kfree(file->private_data);
944 return 0;
945}
946
947static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
948 size_t nbytes, loff_t *ppos)
949{
950 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
951 i_size_read(file->f_mapping->host));
952}
953#else
954static int o2hb_debug_open(struct inode *inode, struct file *file)
955{
956 return 0;
957}
958static int o2hb_debug_release(struct inode *inode, struct file *file)
959{
960 return 0;
961}
962static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
963 size_t nbytes, loff_t *ppos)
964{
965 return 0;
966}
967#endif /* CONFIG_DEBUG_FS */
968
969static struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read,
973 .llseek = generic_file_llseek,
974};
975
976void o2hb_exit(void)
977{
978 if (o2hb_debug_livenodes)
979 debugfs_remove(o2hb_debug_livenodes);
980 if (o2hb_debug_dir)
981 debugfs_remove(o2hb_debug_dir);
982}
983
984int o2hb_init(void)
909{ 985{
910 int i; 986 int i;
911 987
@@ -918,6 +994,24 @@ void o2hb_init(void)
918 INIT_LIST_HEAD(&o2hb_node_events); 994 INIT_LIST_HEAD(&o2hb_node_events);
919 995
920 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 996 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
997
998 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
999 if (!o2hb_debug_dir) {
1000 mlog_errno(-ENOMEM);
1001 return -ENOMEM;
1002 }
1003
1004 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1005 S_IFREG|S_IRUSR,
1006 o2hb_debug_dir, NULL,
1007 &o2hb_debug_fops);
1008 if (!o2hb_debug_livenodes) {
1009 mlog_errno(-ENOMEM);
1010 debugfs_remove(o2hb_debug_dir);
1011 return -ENOMEM;
1012 }
1013
1014 return 0;
921} 1015}
922 1016
923/* if we're already in a callback then we're already serialized by the sem */ 1017/* if we're already in a callback then we're already serialized by the sem */
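
o2hb_debug_open() above uses a common debugfs idiom: snapshot the state into a buffer at open time, publish its length with i_size_write(), serve reads from the buffer with simple_read_from_buffer(), and free it at release. The formatting loop itself is just a walk over the live-node bitmap; a user-space miniature of that walk (node numbers made up):

    #include <stdio.h>

    #define MAX_NODES 255

    int main(void)
    {
            unsigned char map[MAX_NODES / 8 + 1] = { 0 };
            int i;

            map[0] |= 1 << 3;       /* pretend node 3 is heartbeating */
            map[1] |= 1 << 1;       /* ... and node 9 */

            for (i = 0; i < MAX_NODES; i++)
                    if (map[i / 8] & (1 << (i % 8)))
                            printf("%d ", i);
            printf("\n");           /* prints: 3 9 */
            return 0;
    }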
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc); 75 struct o2hb_callback_func *hc);
76void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
77 unsigned bytes); 77 unsigned bytes);
78void o2hb_init(void); 78void o2hb_exit(void);
79int o2hb_init(void);
79int o2hb_check_node_heartbeating(u8 node_num); 80int o2hb_check_node_heartbeating(u8 node_num);
80int o2hb_check_node_heartbeating_from_callback(u8 node_num); 81int o2hb_check_node_heartbeating_from_callback(u8 node_num);
81int o2hb_check_local_node_heartbeating(void); 82int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
881 o2cb_sys_shutdown(); 881 o2cb_sys_shutdown();
882 882
883 o2net_exit(); 883 o2net_exit();
884 o2hb_exit();
884} 885}
885 886
886static int __init init_o2nm(void) 887static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
889 890
890 cluster_print_version(); 891 cluster_print_version();
891 892
892 o2hb_init(); 893 ret = o2hb_init();
894 if (ret)
895 goto out;
893 896
894 ret = o2net_init(); 897 ret = o2net_init();
895 if (ret) 898 if (ret)
896 goto out; 899 goto out_o2hb;
897 900
898 ret = o2net_register_hb_callbacks(); 901 ret = o2net_register_hb_callbacks();
899 if (ret) 902 if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
916 o2net_unregister_hb_callbacks(); 919 o2net_unregister_hb_callbacks();
917out_o2net: 920out_o2net:
918 o2net_exit(); 921 o2net_exit();
922out_o2hb:
923 o2hb_exit();
919out: 924out:
920 return ret; 925 return ret;
921} 926}
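
The init_o2nm() changes extend the kernel's standard unwind ladder: each init step that succeeds gains a matching label, so any later failure tears things down in exact reverse order, and the new out_o2hb label keeps o2hb_exit() paired with the now-fallible o2hb_init(). In miniature (stubs stand in for the real initializers):

    #include <stdio.h>

    static int o2hb_init_stub(void)  { return 0; }
    static void o2hb_exit_stub(void) { }
    static int o2net_init_stub(void) { return 0; }

    static int init_sketch(void)
    {
            int ret;

            ret = o2hb_init_stub();
            if (ret)
                    goto out;

            ret = o2net_init_stub();
            if (ret)
                    goto out_o2hb;  /* later failures unwind earlier steps */

            return 0;

    out_o2hb:
            o2hb_exit_stub();
    out:
            return ret;
    }

    int main(void)
    {
            printf("init: %d\n", init_sketch());
            return 0;
    }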
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7d604480557a..b574431a031d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -290,6 +290,21 @@ out_attach:
290 else 290 else
291 mlog_errno(ret); 291 mlog_errno(ret);
292 292
293 /*
294 * In case of error, manually free the allocation and do the iput().
295 * We need to do this because error here means no d_instantiate(),
296 * which means iput() will not be called during dput(dentry).
297 */
298 if (ret < 0 && !alias) {
299 ocfs2_lock_res_free(&dl->dl_lockres);
300 BUG_ON(dl->dl_count != 1);
301 spin_lock(&dentry_attach_lock);
302 dentry->d_fsdata = NULL;
303 spin_unlock(&dentry_attach_lock);
304 kfree(dl);
305 iput(inode);
306 }
307
293 dput(alias); 308 dput(alias);
294 309
295 return ret; 310 return ret;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..c5752305627c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
83 struct ocfs2_alloc_context *data_ac, 80 struct ocfs2_alloc_context *data_ac,
84 struct ocfs2_alloc_context *meta_ac, 81 struct ocfs2_alloc_context *meta_ac,
85 struct buffer_head **new_bh); 82 struct buffer_head **new_bh);
83static int ocfs2_dir_indexed(struct inode *inode);
86 84
87/* 85/*
88 * These are distinct checks because future versions of the file system will 86 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing. 87 * want to have a trailing dirent structure independent of indexing.
90 */ 88 */
91static int ocfs2_dir_has_trailer(struct inode *dir) 89static int ocfs2_supports_dir_trailer(struct inode *dir)
92{ 90{
91 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
92
93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
94 return 0; 94 return 0;
95 95
96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
97} 97}
98 98
99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99/*
100 * "new' here refers to the point at which we're creating a new
101 * directory via "mkdir()", but also when we're expanding an inline
102 * directory. In either case, we don't yet have the indexing bit set
103 * on the directory, so the standard checks will fail in when metaecc
104 * is turned off. Only directory-initialization type functions should
105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
106 */
107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
100{ 108{
101 return ocfs2_meta_ecc(osb); 109 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
110
111 return ocfs2_meta_ecc(osb) ||
112 ocfs2_supports_indexed_dirs(osb);
102} 113}
103 114
104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
130{ 141{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 142 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
132 143
133 if (!ocfs2_dir_has_trailer(dir)) 144 if (!ocfs2_supports_dir_trailer(dir))
134 return 0; 145 return 0;
135 146
136 if (offset != toff) 147 if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
140} 151}
141 152
142static void ocfs2_init_dir_trailer(struct inode *inode, 153static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh) 154 struct buffer_head *bh, u16 rec_len)
144{ 155{
145 struct ocfs2_dir_block_trailer *trailer; 156 struct ocfs2_dir_block_trailer *trailer;
146 157
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 161 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 162 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 163 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
164 trailer->db_free_rec_len = cpu_to_le16(rec_len);
165}
166/*
167 * Link an unindexed block with a dir trailer structure into the index free
168 * list. This function will modify dirdata_bh, but assumes you've already
169 * passed it to the journal.
170 */
171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
172 struct buffer_head *dx_root_bh,
173 struct buffer_head *dirdata_bh)
174{
175 int ret;
176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer;
178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) {
182 mlog_errno(ret);
183 goto out;
184 }
185 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
186 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
187
188 trailer->db_free_next = dx_root->dr_free_blk;
189 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
190
191 ocfs2_journal_dirty(handle, dx_root_bh);
192
193out:
194 return ret;
195}
196
197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
198{
199 return res->dl_prev_leaf_bh == NULL;
200}
201
202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
203{
204 brelse(res->dl_dx_root_bh);
205 brelse(res->dl_leaf_bh);
206 brelse(res->dl_dx_leaf_bh);
207 brelse(res->dl_prev_leaf_bh);
208}
209
210static int ocfs2_dir_indexed(struct inode *inode)
211{
212 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
213 return 1;
214 return 0;
215}
216
217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
218{
219 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
220}
221
222/*
223 * Hashing code adapted from ext3
224 */
225#define DELTA 0x9E3779B9
226
227static void TEA_transform(__u32 buf[4], __u32 const in[])
228{
229 __u32 sum = 0;
230 __u32 b0 = buf[0], b1 = buf[1];
231 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
232 int n = 16;
233
234 do {
235 sum += DELTA;
236 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
237 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
238 } while (--n);
239
240 buf[0] += b0;
241 buf[1] += b1;
242}
243
244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
245{
246 __u32 pad, val;
247 int i;
248
249 pad = (__u32)len | ((__u32)len << 8);
250 pad |= pad << 16;
251
252 val = pad;
253 if (len > num*4)
254 len = num * 4;
255 for (i = 0; i < len; i++) {
256 if ((i % 4) == 0)
257 val = pad;
258 val = msg[i] + (val << 8);
259 if ((i % 4) == 3) {
260 *buf++ = val;
261 val = pad;
262 num--;
263 }
264 }
265 if (--num >= 0)
266 *buf++ = val;
267 while (--num >= 0)
268 *buf++ = pad;
269}
270
271static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
272 struct ocfs2_dx_hinfo *hinfo)
273{
274 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
275 const char *p;
276 __u32 in[8], buf[4];
277
278 /*
279 * XXX: Is this really necessary, if the index is never looked
280 * at by readdir? Is a hash value of '0' a bad idea?
281 */
282 if ((len == 1 && !strncmp(".", name, 1)) ||
283 (len == 2 && !strncmp("..", name, 2))) {
284 buf[0] = buf[1] = 0;
285 goto out;
286 }
287
288#ifdef OCFS2_DEBUG_DX_DIRS
289 /*
290 * This makes it very easy to debug indexing problems. We
291 * should never allow this to be selected without hand editing
292 * this file though.
293 */
294 buf[0] = buf[1] = len;
295 goto out;
296#endif
297
298 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
299
300 p = name;
301 while (len > 0) {
302 str2hashbuf(p, len, in, 4);
303 TEA_transform(buf, in);
304 len -= 16;
305 p += 16;
306 }
307
308out:
309 hinfo->major_hash = buf[0];
310 hinfo->minor_hash = buf[1];
153} 311}
154 312
155/* 313/*
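
The hash above is a cut-down TEA (Tiny Encryption Algorithm) mix borrowed from ext3's htree: str2hashbuf() packs the name into 32-bit words with length-derived padding, and each 16-byte chunk is folded through 16 TEA rounds, seeded per volume from osb_dx_seed. The pipeline runs unchanged in user space; the sketch below lifts the two helpers verbatim from this hunk and substitutes a made-up seed for the superblock's:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define DELTA 0x9E3779B9

    static void TEA_transform(uint32_t buf[4], const uint32_t in[])
    {
            uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
            uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
            int n = 16;

            do {
                    sum += DELTA;
                    b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
                    b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
            } while (--n);

            buf[0] += b0;
            buf[1] += b1;
    }

    static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
    {
            uint32_t pad, val;
            int i;

            pad = (uint32_t)len | ((uint32_t)len << 8);
            pad |= pad << 16;

            val = pad;
            if (len > num * 4)
                    len = num * 4;
            for (i = 0; i < len; i++) {
                    if ((i % 4) == 0)
                            val = pad;
                    val = msg[i] + (val << 8);
                    if ((i % 4) == 3) {
                            *buf++ = val;
                            val = pad;
                            num--;
                    }
            }
            if (--num >= 0)
                    *buf++ = val;
            while (--num >= 0)
                    *buf++ = pad;
    }

    int main(void)
    {
            const char *name = "lost+found";            /* any name but "." and ".." */
            uint32_t in[8], buf[4] = { 1, 2, 3, 4 };    /* fake dx seed */
            int len = (int)strlen(name);
            const char *p = name;

            while (len > 0) {
                    str2hashbuf(p, len, in, 4);
                    TEA_transform(buf, in);
                    len -= 16;
                    p += 16;
            }
            printf("major=%u minor=0x%x\n", buf[0], buf[1]);
            return 0;
    }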
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
312} 470}
313 471
314/* 472/*
473 * Validate a directory trailer.
474 *
475 * We check the trailer here rather than in ocfs2_validate_dir_block()
476 * because that function doesn't have the inode to test.
477 */
478static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
479{
480 int rc = 0;
481 struct ocfs2_dir_block_trailer *trailer;
482
483 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
484 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
485 rc = -EINVAL;
486 ocfs2_error(dir->i_sb,
487 "Invalid dirblock #%llu: "
488 "signature = %.*s\n",
489 (unsigned long long)bh->b_blocknr, 7,
490 trailer->db_signature);
491 goto out;
492 }
493 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
494 rc = -EINVAL;
495 ocfs2_error(dir->i_sb,
496 "Directory block #%llu has an invalid "
497 "db_blkno of %llu",
498 (unsigned long long)bh->b_blocknr,
499 (unsigned long long)le64_to_cpu(trailer->db_blkno));
500 goto out;
501 }
502 if (le64_to_cpu(trailer->db_parent_dinode) !=
503 OCFS2_I(dir)->ip_blkno) {
504 rc = -EINVAL;
505 ocfs2_error(dir->i_sb,
506 "Directory block #%llu on dinode "
507 "#%llu has an invalid parent_dinode "
508 "of %llu",
509 (unsigned long long)bh->b_blocknr,
510 (unsigned long long)OCFS2_I(dir)->ip_blkno,
 511 (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
512 goto out;
513 }
514out:
515 return rc;
516}
517
518/*
315 * This function forces all errors to -EIO for consistency with its 519 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the 520 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with 521 * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
322{ 526{
323 int rc = 0; 527 int rc = 0;
324 struct buffer_head *tmp = *bh; 528 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326 529
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 530 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block); 531 ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
331 goto out; 534 goto out;
332 } 535 }
333 536
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) && 537 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) { 538 ocfs2_supports_dir_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 539 rc = ocfs2_check_dir_trailer(inode, tmp);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 540 if (rc) {
343 rc = -EINVAL; 541 if (!*bh)
344 ocfs2_error(inode->i_sb, 542 brelse(tmp);
345 "Invalid dirblock #%llu: " 543 mlog_errno(rc);
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out; 544 goto out;
371 } 545 }
372 } 546 }
@@ -379,6 +553,141 @@ out:
379 return rc ? -EIO : 0; 553 return rc ? -EIO : 0;
380} 554}
381 555
556/*
557 * Read the block at 'phys' which belongs to this directory
558 * inode. This function does no virtual->physical block translation -
559 * what's passed in is assumed to be a valid directory block.
560 */
561static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
562 struct buffer_head **bh)
563{
564 int ret;
565 struct buffer_head *tmp = *bh;
566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
568 if (ret) {
569 mlog_errno(ret);
570 goto out;
571 }
572
573 if (ocfs2_supports_dir_trailer(dir)) {
574 ret = ocfs2_check_dir_trailer(dir, tmp);
575 if (ret) {
576 if (!*bh)
577 brelse(tmp);
578 mlog_errno(ret);
579 goto out;
580 }
581 }
582
583 if (!ret && !*bh)
584 *bh = tmp;
585out:
586 return ret;
587}
588
589static int ocfs2_validate_dx_root(struct super_block *sb,
590 struct buffer_head *bh)
591{
592 int ret;
593 struct ocfs2_dx_root_block *dx_root;
594
595 BUG_ON(!buffer_uptodate(bh));
596
597 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
598
599 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
600 if (ret) {
601 mlog(ML_ERROR,
602 "Checksum failed for dir index root block %llu\n",
603 (unsigned long long)bh->b_blocknr);
604 return ret;
605 }
606
607 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
608 ocfs2_error(sb,
609 "Dir Index Root # %llu has bad signature %.*s",
610 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
611 7, dx_root->dr_signature);
612 return -EINVAL;
613 }
614
615 return 0;
616}
617
618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
619 struct buffer_head **dx_root_bh)
620{
621 int ret;
622 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh;
624
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
626
627 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh)
629 *dx_root_bh = tmp;
630
631 return ret;
632}
633
634static int ocfs2_validate_dx_leaf(struct super_block *sb,
635 struct buffer_head *bh)
636{
637 int ret;
638 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
639
640 BUG_ON(!buffer_uptodate(bh));
641
642 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
643 if (ret) {
644 mlog(ML_ERROR,
645 "Checksum failed for dir index leaf block %llu\n",
646 (unsigned long long)bh->b_blocknr);
647 return ret;
648 }
649
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
652 7, dx_leaf->dl_signature);
653 return -EROFS;
654 }
655
656 return 0;
657}
658
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
660 struct buffer_head **dx_leaf_bh)
661{
662 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh;
664
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
666
667 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh)
669 *dx_leaf_bh = tmp;
670
671 return ret;
672}
673
674/*
675 * Read a series of dx_leaf blocks. This expects all buffer_head
676 * pointers to be NULL on function entry.
677 */
678static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
679 struct buffer_head **dx_leaf_bhs)
680{
681 int ret;
682
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf);
685 if (ret)
686 mlog_errno(ret);
687
688 return ret;
689}
690
382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 691static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
383 struct inode *dir, 692 struct inode *dir,
384 struct ocfs2_dir_entry **res_dir) 693 struct ocfs2_dir_entry **res_dir)
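
ocfs2_read_dir_block_direct(), ocfs2_read_dx_root() and ocfs2_read_dx_leaf() above share one ownership convention: the caller may pass an existing buffer or a NULL slot; on success a freshly read buffer is handed back through the slot, and on failure the helper releases only buffers it allocated itself - the "if (!*bh) brelse(tmp)" dance. Reduced to a stand-alone sketch:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { char data[4096]; };

    static int fill_and_validate(struct buf *b)
    {
            (void)b;        /* pretend to read and checksum the block */
            return 0;
    }

    static int read_block(struct buf **slot)
    {
            struct buf *tmp = *slot;
            int ours = (tmp == NULL), ret;

            if (ours) {
                    tmp = malloc(sizeof(*tmp));
                    if (!tmp)
                            return -1;
            }

            ret = fill_and_validate(tmp);
            if (ret) {
                    if (ours)
                            free(tmp);      /* release only what we allocated */
                    return ret;
            }

            if (ours)
                    *slot = tmp;    /* hand the new buffer back to the caller */
            return 0;
    }

    int main(void)
    {
            struct buf *bh = NULL;

            printf("read: %d, bh %s\n", read_block(&bh),
                   bh ? "allocated" : "NULL");
            free(bh);
            return 0;
    }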
@@ -480,39 +789,340 @@ cleanup_and_exit:
480 return ret; 789 return ret;
481} 790}
482 791
792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
793 struct ocfs2_extent_list *el,
794 u32 major_hash,
795 u32 *ret_cpos,
796 u64 *ret_phys_blkno,
797 unsigned int *ret_clen)
798{
799 int ret = 0, i, found;
800 struct buffer_head *eb_bh = NULL;
801 struct ocfs2_extent_block *eb;
802 struct ocfs2_extent_rec *rec = NULL;
803
804 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
812 el = &eb->h_list;
813
814 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in "
817 "btree tree block %llu\n", inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out;
821 }
822 }
823
824 found = 0;
825 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
826 rec = &el->l_recs[i];
827
828 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
829 found = 1;
830 break;
831 }
832 }
833
834 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
836 "record (%u, %u, 0) in btree", inode->i_ino,
837 le32_to_cpu(rec->e_cpos),
838 ocfs2_rec_clusters(el, rec));
839 ret = -EROFS;
840 goto out;
841 }
842
843 if (ret_phys_blkno)
844 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
845 if (ret_cpos)
846 *ret_cpos = le32_to_cpu(rec->e_cpos);
847 if (ret_clen)
848 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
849
850out:
851 brelse(eb_bh);
852 return ret;
853}
854
855/*
856 * Returns the block index, from the start of the cluster which this
 857 * hash belongs to.
858 */
859static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
860 u32 minor_hash)
861{
862 return minor_hash & osb->osb_dx_mask;
863}
864
865static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
866 struct ocfs2_dx_hinfo *hinfo)
867{
868 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
869}
870
871static int ocfs2_dx_dir_lookup(struct inode *inode,
872 struct ocfs2_extent_list *el,
873 struct ocfs2_dx_hinfo *hinfo,
874 u32 *ret_cpos,
875 u64 *ret_phys_blkno)
876{
877 int ret = 0;
878 unsigned int cend, uninitialized_var(clen);
879 u32 uninitialized_var(cpos);
880 u64 uninitialized_var(blkno);
881 u32 name_hash = hinfo->major_hash;
882
883 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
884 &clen);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 cend = cpos + clen;
891 if (name_hash >= cend) {
892 /* We want the last cluster */
893 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
894 cpos += clen - 1;
895 } else {
896 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
897 name_hash - cpos);
898 cpos = name_hash;
899 }
900
901 /*
902 * We now have the cluster which should hold our entry. To
903 * find the exact block from the start of the cluster to
904 * search, we take the lower bits of the hash.
905 */
906 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
907
908 if (ret_phys_blkno)
909 *ret_phys_blkno = blkno;
910 if (ret_cpos)
911 *ret_cpos = cpos;
912
913out:
914
915 return ret;
916}
917
918static int ocfs2_dx_dir_search(const char *name, int namelen,
919 struct inode *dir,
920 struct ocfs2_dx_root_block *dx_root,
921 struct ocfs2_dir_lookup_result *res)
922{
923 int ret, i, found;
924 u64 uninitialized_var(phys);
925 struct buffer_head *dx_leaf_bh = NULL;
926 struct ocfs2_dx_leaf *dx_leaf;
927 struct ocfs2_dx_entry *dx_entry = NULL;
928 struct buffer_head *dir_ent_bh = NULL;
929 struct ocfs2_dir_entry *dir_ent = NULL;
930 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
931 struct ocfs2_extent_list *dr_el;
932 struct ocfs2_dx_entry_list *entry_list;
933
934 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
935
936 if (ocfs2_dx_root_inline(dx_root)) {
937 entry_list = &dx_root->dr_entries;
938 goto search;
939 }
940
941 dr_el = &dx_root->dr_list;
942
943 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
944 if (ret) {
945 mlog_errno(ret);
946 goto out;
947 }
948
949 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
950 "returns: %llu\n",
951 (unsigned long long)OCFS2_I(dir)->ip_blkno,
952 namelen, name, hinfo->major_hash, hinfo->minor_hash,
953 (unsigned long long)phys);
954
955 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
956 if (ret) {
957 mlog_errno(ret);
958 goto out;
959 }
960
961 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
962
963 mlog(0, "leaf info: num_used: %d, count: %d\n",
964 le16_to_cpu(dx_leaf->dl_list.de_num_used),
965 le16_to_cpu(dx_leaf->dl_list.de_count));
966
967 entry_list = &dx_leaf->dl_list;
968
969search:
970 /*
971 * Empty leaf is legal, so no need to check for that.
972 */
973 found = 0;
974 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
975 dx_entry = &entry_list->de_entries[i];
976
977 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
978 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
979 continue;
980
981 /*
982 * Search unindexed leaf block now. We're not
983 * guaranteed to find anything.
984 */
985 ret = ocfs2_read_dir_block_direct(dir,
986 le64_to_cpu(dx_entry->dx_dirent_blk),
987 &dir_ent_bh);
988 if (ret) {
989 mlog_errno(ret);
990 goto out;
991 }
992
993 /*
994 * XXX: We should check the unindexed block here,
995 * before using it.
996 */
997
998 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
999 0, dir_ent_bh->b_data,
1000 dir->i_sb->s_blocksize, &dir_ent);
1001 if (found == 1)
1002 break;
1003
1004 if (found == -1) {
1005 /* This means we found a bad directory entry. */
1006 ret = -EIO;
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 brelse(dir_ent_bh);
1012 dir_ent_bh = NULL;
1013 }
1014
1015 if (found <= 0) {
1016 ret = -ENOENT;
1017 goto out;
1018 }
1019
1020 res->dl_leaf_bh = dir_ent_bh;
1021 res->dl_entry = dir_ent;
1022 res->dl_dx_leaf_bh = dx_leaf_bh;
1023 res->dl_dx_entry = dx_entry;
1024
1025 ret = 0;
1026out:
1027 if (ret) {
1028 brelse(dx_leaf_bh);
1029 brelse(dir_ent_bh);
1030 }
1031 return ret;
1032}
1033
1034static int ocfs2_find_entry_dx(const char *name, int namelen,
1035 struct inode *dir,
1036 struct ocfs2_dir_lookup_result *lookup)
1037{
1038 int ret;
1039 struct buffer_head *di_bh = NULL;
1040 struct ocfs2_dinode *di;
1041 struct buffer_head *dx_root_bh = NULL;
1042 struct ocfs2_dx_root_block *dx_root;
1043
1044 ret = ocfs2_read_inode_block(dir, &di_bh);
1045 if (ret) {
1046 mlog_errno(ret);
1047 goto out;
1048 }
1049
1050 di = (struct ocfs2_dinode *)di_bh->b_data;
1051
1052 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1053 if (ret) {
1054 mlog_errno(ret);
1055 goto out;
1056 }
1057 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1058
1059 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1060 if (ret) {
1061 if (ret != -ENOENT)
1062 mlog_errno(ret);
1063 goto out;
1064 }
1065
1066 lookup->dl_dx_root_bh = dx_root_bh;
1067 dx_root_bh = NULL;
1068out:
1069 brelse(di_bh);
1070 brelse(dx_root_bh);
1071 return ret;
1072}
1073
483/* 1074/*
484 * Try to find an entry of the provided name within 'dir'. 1075 * Try to find an entry of the provided name within 'dir'.
485 * 1076 *
486 * If nothing was found, NULL is returned. Otherwise, a buffer_head 1077 * If nothing was found, -ENOENT is returned. Otherwise, zero is
487 * and pointer to the dir entry are passed back. 1078 * returned and the struct 'res' will contain information useful to
1079 * other directory manipulation functions.
488 * 1080 *
489 * Caller can NOT assume anything about the contents of the 1081 * Caller can NOT assume anything about the contents of the
 490 * buffer_head - it is passed back only so that it can be passed 1082 * buffer_heads - they are passed back only so that they can be passed
491 * any one of the manipulation functions (add entry, delete entry, 1083 * into any one of the manipulation functions (add entry, delete
492 * etc). As an example, bh in the extent directory case is a data 1084 * entry, etc). As an example, bh in the extent directory case is a
493 * block, in the inline-data case it actually points to an inode. 1085 * data block, in the inline-data case it actually points to an inode,
1086 * in the indexed directory case, multiple buffers are involved.
494 */ 1087 */
495struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1088int ocfs2_find_entry(const char *name, int namelen,
496 struct inode *dir, 1089 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
497 struct ocfs2_dir_entry **res_dir)
498{ 1090{
499 *res_dir = NULL; 1091 struct buffer_head *bh;
1092 struct ocfs2_dir_entry *res_dir = NULL;
500 1093
1094 if (ocfs2_dir_indexed(dir))
1095 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1096
1097 /*
1098 * The unindexed dir code only uses part of the lookup
1099 * structure, so there's no reason to push it down further
1100 * than this.
1101 */
501 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1102 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
502 return ocfs2_find_entry_id(name, namelen, dir, res_dir); 1103 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1104 else
1105 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1106
1107 if (bh == NULL)
1108 return -ENOENT;
503 1109
504 return ocfs2_find_entry_el(name, namelen, dir, res_dir); 1110 lookup->dl_leaf_bh = bh;
1111 lookup->dl_entry = res_dir;
1112 return 0;
505} 1113}
506 1114
507/* 1115/*
508 * Update inode number and type of a previously found directory entry. 1116 * Update inode number and type of a previously found directory entry.
509 */ 1117 */
510int ocfs2_update_entry(struct inode *dir, handle_t *handle, 1118int ocfs2_update_entry(struct inode *dir, handle_t *handle,
511 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 1119 struct ocfs2_dir_lookup_result *res,
512 struct inode *new_entry_inode) 1120 struct inode *new_entry_inode)
513{ 1121{
514 int ret; 1122 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1123 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1124 struct ocfs2_dir_entry *de = res->dl_entry;
1125 struct buffer_head *de_bh = res->dl_leaf_bh;
516 1126
517 /* 1127 /*
518 * The same code works fine for both inline-data and extent 1128 * The same code works fine for both inline-data and extent
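
ocfs2_dx_dir_lookup() in the previous hunk converts a hash pair into a disk block in two steps: the major hash acts as a cluster offset resolved through the dx extent list, then the low bits of the minor hash (masked with osb_dx_mask, i.e. blocks-per-cluster minus one, as in __ocfs2_dx_dir_hash_idx()) pick the block inside that cluster. A worked example with invented geometry, ignoring the clamp to the extent's last cluster:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t major_hash = 0x1234, minor_hash = 0xabcd;
            uint32_t cpos = 0x1000;     /* e_cpos of the covering record */
            uint64_t rec_blkno = 800;   /* e_blkno: first block of the extent */
            unsigned int bpc = 8;       /* blocks per cluster (made up) */
            uint64_t blkno;

            /* Step 1: first block of the cluster covering major_hash. */
            blkno = rec_blkno + (uint64_t)(major_hash - cpos) * bpc;

            /* Step 2: block within that cluster, from the low minor-hash bits. */
            blkno += minor_hash & (bpc - 1);

            /* prints 5317: 800 + 564 * 8 + 5 */
            printf("dx leaf block: %llu\n", (unsigned long long)blkno);
            return 0;
    }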
@@ -538,6 +1148,10 @@ out:
538 return ret; 1148 return ret;
539} 1149}
540 1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
541static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
542 struct ocfs2_dir_entry *de_del, 1156 struct ocfs2_dir_entry *de_del,
543 struct buffer_head *bh, char *first_de, 1157 struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
587 return status; 1201 return status;
588} 1202}
589 1203
1204static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1205{
1206 unsigned int hole;
1207
1208 if (le64_to_cpu(de->inode) == 0)
1209 hole = le16_to_cpu(de->rec_len);
1210 else
1211 hole = le16_to_cpu(de->rec_len) -
1212 OCFS2_DIR_REC_LEN(de->name_len);
1213
1214 return hole;
1215}
1216
1217static int ocfs2_find_max_rec_len(struct super_block *sb,
1218 struct buffer_head *dirblock_bh)
1219{
1220 int size, this_hole, largest_hole = 0;
1221 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1222 struct ocfs2_dir_entry *de;
1223
1224 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1225 size = ocfs2_dir_trailer_blk_off(sb);
1226 limit = start + size;
1227 de_buf = start;
1228 de = (struct ocfs2_dir_entry *)de_buf;
1229 do {
1230 if (de_buf != trailer) {
1231 this_hole = ocfs2_figure_dirent_hole(de);
1232 if (this_hole > largest_hole)
1233 largest_hole = this_hole;
1234 }
1235
1236 de_buf += le16_to_cpu(de->rec_len);
1237 de = (struct ocfs2_dir_entry *)de_buf;
1238 } while (de_buf < limit);
1239
1240 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1241 return largest_hole;
1242 return 0;
1243}
1244
1245static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1246 int index)
1247{
1248 int num_used = le16_to_cpu(entry_list->de_num_used);
1249
1250 if (num_used == 1 || index == (num_used - 1))
1251 goto clear;
1252
1253 memmove(&entry_list->de_entries[index],
1254 &entry_list->de_entries[index + 1],
1255 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1256clear:
1257 num_used--;
1258 memset(&entry_list->de_entries[num_used], 0,
1259 sizeof(struct ocfs2_dx_entry));
1260 entry_list->de_num_used = cpu_to_le16(num_used);
1261}
1262
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+				 struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret, index, max_rec_len, add_to_free_list = 0;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+	struct ocfs2_dir_block_trailer *trailer;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * This function gets a bit messy because we might have to
+	 * modify the root block, regardless of whether the indexed
+	 * entries are stored inline.
+	 */
+
+	/*
+	 * *Only* set 'entry_list' here, based on where we're looking
+	 * for the indexed entries. Later, we might still want to
+	 * journal both blocks, based on free list state.
+	 */
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		entry_list = &dx_root->dr_entries;
+	} else {
+		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+		entry_list = &dx_leaf->dl_list;
+	}
+
+	/* Neither of these is disk corruption - that should have
+	 * been caught by lookup, before we got here. */
+	BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+	BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+	index = (char *)dx_entry - (char *)entry_list->de_entries;
+	index /= sizeof(*dx_entry);
+
+	if (index >= le16_to_cpu(entry_list->de_num_used)) {
+		mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+		     entry_list, dx_entry);
+		return -EIO;
+	}
+
+	/*
+	 * We know that removal of this dirent will leave enough room
+	 * for a new one, so add this block to the free list if it
+	 * isn't already there.
+	 */
+	trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+	if (trailer->db_free_rec_len == 0)
+		add_to_free_list = 1;
+
+	/*
+	 * Add the block holding our index into the journal before
+	 * removing the unindexed entry. If we get an error return
+	 * from __ocfs2_delete_entry(), then it hasn't removed the
+	 * entry yet. Likewise, successful return means we *must*
+	 * remove the indexed entry.
+	 *
+	 * We're also careful to journal the root tree block here as
+	 * the entry count needs to be updated. Also, we might be
+	 * adding to the start of the free list.
+	 */
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!ocfs2_dx_root_inline(dx_root)) {
+		ret = ocfs2_journal_access_dl(handle, dir,
+					      lookup->dl_dx_leaf_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	mlog(0, "Dir %llu: delete entry at index: %d\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+	trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+	if (add_to_free_list) {
+		trailer->db_free_next = dx_root->dr_free_blk;
+		dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+		ocfs2_journal_dirty(handle, dx_root_bh);
+	}
+
+	/* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+	ocfs2_journal_dirty(handle, leaf_bh);
+
+	le32_add_cpu(&dx_root->dr_num_entries, -1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+	ocfs2_dx_list_remove_entry(entry_list, index);
+
+	if (!ocfs2_dx_root_inline(dx_root))
+		ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+	return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
 					struct inode *dir,
 					struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+		       struct ocfs2_dir_lookup_result *res)
 {
+	if (ocfs2_dir_indexed(dir))
+		return ocfs2_delete_entry_dx(handle, dir, res);
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+		return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+					     res->dl_leaf_bh);
 
-	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+	return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+				     res->dl_leaf_bh);
 }
 
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
 	return 0;
 }
 
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+					  struct ocfs2_dx_entry *dx_new_entry)
+{
+	int i;
+
+	i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+	dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+	le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+				       struct ocfs2_dx_hinfo *hinfo,
+				       u64 dirent_blk)
+{
+	int i;
+	struct ocfs2_dx_entry *dx_entry;
+
+	i = le16_to_cpu(entry_list->de_num_used);
+	dx_entry = &entry_list->de_entries[i];
+
+	memset(dx_entry, 0, sizeof(*dx_entry));
+	dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+	dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+	dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+	le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+				      struct ocfs2_dx_hinfo *hinfo,
+				      u64 dirent_blk,
+				      struct buffer_head *dx_leaf_bh)
+{
+	int ret;
+	struct ocfs2_dx_leaf *dx_leaf;
+
+	ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+	ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+					struct ocfs2_dx_hinfo *hinfo,
+					u64 dirent_blk,
+					struct ocfs2_dx_root_block *dx_root)
+{
+	ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+			       struct ocfs2_dir_lookup_result *lookup)
+{
+	int ret = 0;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+	if (ocfs2_dx_root_inline(dx_root)) {
+		ocfs2_dx_inline_root_insert(dir, handle,
+					    &lookup->dl_hinfo,
+					    lookup->dl_leaf_bh->b_blocknr,
+					    dx_root);
+	} else {
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+						 lookup->dl_leaf_bh->b_blocknr,
+						 lookup->dl_dx_leaf_bh);
+		if (ret)
+			goto out;
+	}
+
+	le32_add_cpu(&dx_root->dr_num_entries, 1);
+	ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+	return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+					      handle_t *handle,
+					      struct ocfs2_dir_lookup_result *lookup)
+{
+	struct ocfs2_dir_block_trailer *trailer, *prev;
+	struct ocfs2_dx_root_block *dx_root;
+	struct buffer_head *bh;
+
+	trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+	if (ocfs2_free_list_at_root(lookup)) {
+		bh = lookup->dl_dx_root_bh;
+		dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+		dx_root->dr_free_blk = trailer->db_free_next;
+	} else {
+		bh = lookup->dl_prev_leaf_bh;
+		prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+		prev->db_free_next = trailer->db_free_next;
+	}
+
+	trailer->db_free_rec_len = cpu_to_le16(0);
+	trailer->db_free_next = cpu_to_le64(0);
+
+	ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+				   struct ocfs2_dir_lookup_result *lookup)
+{
+	int max_rec_len;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	/* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+	max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+	if (max_rec_len) {
+		/*
+		 * There's still room in this block, so no need to remove it
+		 * from the free list. In this case, we just want to update
+		 * the rec len accounting.
+		 */
+		trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+		trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+		ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+	} else {
+		ocfs2_remove_block_from_free_list(dir, handle, lookup);
+	}
+}
+
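The free list maintained by ocfs2_recalc_free_list() and ocfs2_remove_block_from_free_list() is a singly linked list threaded through the directory block trailers, with its head in the dx root; the lookup phase records the previous element so unlinking never needs a list walk. A sketch of the unlink step under those assumptions, using in-memory structs in place of journaled buffers:

#include <stdio.h>

struct blk {
	unsigned int free_rec_len;	/* largest hole; 0 == not on list */
	struct blk *free_next;
};

static void free_list_unlink(struct blk **head, struct blk *prev,
			     struct blk *victim)
{
	if (prev == NULL)			/* victim is the list head */
		*head = victim->free_next;
	else
		prev->free_next = victim->free_next;

	victim->free_rec_len = 0;
	victim->free_next = NULL;
}

int main(void)
{
	struct blk a = { 40, NULL }, b = { 12, NULL };
	struct blk *head = &a;

	a.free_next = &b;
	free_list_unlink(&head, &a, &b);	/* 'b' filled up */
	printf("head hole: %u, next: %p\n", head->free_rec_len,
	       (void *)head->free_next);
	return 0;
}
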
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
  */
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh)
+		      struct ocfs2_dir_lookup_result *lookup)
 {
 	unsigned long offset;
 	unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct super_block *sb = dir->i_sb;
 	int retval, status;
 	unsigned int size = sb->s_blocksize;
+	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
 	if (!namelen)
 		return -EINVAL;
 
-	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+	if (ocfs2_dir_indexed(dir)) {
+		struct buffer_head *bh;
+
+		/*
+		 * An indexed dir may require that we update the free space
+		 * list. Reserve a write to the previous node in the list so
+		 * that we don't fail later.
+		 *
+		 * XXX: This can be either a dx_root_block, or an unindexed
+		 * directory tree leaf block.
+		 */
+		if (ocfs2_free_list_at_root(lookup)) {
+			bh = lookup->dl_dx_root_bh;
+			retval = ocfs2_journal_access_dr(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		} else {
+			bh = lookup->dl_prev_leaf_bh;
+			retval = ocfs2_journal_access_db(handle, dir, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		}
+		if (retval) {
+			mlog_errno(retval);
+			return retval;
+		}
+	} else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		data_start = di->id2.i_data.id_data;
 		size = i_size_read(dir);
 
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
 				status = ocfs2_journal_access_di(handle, dir,
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
-			else
+			else {
 				status = ocfs2_journal_access_db(handle, dir,
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
+
+				if (ocfs2_dir_indexed(dir)) {
+					status = ocfs2_dx_dir_insert(dir,
+								     handle,
+								     lookup);
+					if (status) {
+						mlog_errno(status);
+						goto bail;
+					}
+				}
+			}
+
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
 			de->name_len = namelen;
 			memcpy(de->name, name, namelen);
 
+			if (ocfs2_dir_indexed(dir))
+				ocfs2_recalc_free_list(dir, handle, lookup);
+
 			dir->i_version++;
 			status = ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
@@ -870,6 +1851,10 @@ out:
 	return 0;
 }
 
+/*
+ * NOTE: This function can be called against both unindexed and
+ * indexed directories.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
 				    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent)
+			     struct ocfs2_dir_lookup_result *lookup)
 {
 	int status = -ENOENT;
 
-	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-		   namelen, name, blkno, inode, dirent_bh, dirent);
+	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-	if (!*dirent_bh || !*dirent) {
-		status = -ENOENT;
+	status = ocfs2_find_entry(name, namelen, inode, lookup);
+	if (status)
 		goto leave;
-	}
 
-	*blkno = le64_to_cpu((*dirent)->inode);
+	*blkno = le64_to_cpu(lookup->dl_entry->inode);
 
 	status = 0;
 leave:
-	if (status < 0) {
-		*dirent = NULL;
-		brelse(*dirent_bh);
-		*dirent_bh = NULL;
-	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-	brelse(bh);
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 			      int namelen)
 {
 	int ret;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
 	mlog_entry("dir %llu, name '%.*s'\n",
 		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
-	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-	if (dirent_bh)
+	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
 		goto bail;
 
 	ret = 0;
 bail:
-	brelse(dirent_bh);
+	ocfs2_free_dir_lookup_result(&lookup);
 
 	mlog_exit(ret);
 	return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
+	unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 				   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	/*
 	 * Check the positions of "." and ".." records to be sure
 	 * they're in the correct place.
+	 *
+	 * Indexed directories don't need to proceed past the first
+	 * two entries, so we end the scan after seeing '..'. Despite
+	 * that, we allow the scan to proceed in the event that we
+	 * have a corrupted indexed directory (no dot or dot dot
+	 * entries). This allows us to double check for existing
+	 * entries which might not have been found in the index.
 	 */
 	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
 		p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
 	if (name_len == 2 && !strncmp("..", name, 2) &&
 	    pos == OCFS2_DIR_REC_LEN(1)) {
 		p->seen_dot_dot = 1;
+
+		if (p->dx_dir && p->seen_dot)
+			return 1;
+
 		return 0;
 	}
 
 	p->seen_other = 1;
 	return 1;
 }
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+			      struct ocfs2_empty_dir_priv *priv)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dx_root_block *dx_root;
+
+	priv->dx_dir = 1;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+		priv->seen_other = 1;
+
+out:
+	brelse(di_bh);
+	brelse(dx_root_bh);
+	return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
 
 	memset(&priv, 0, sizeof(priv));
 
+	if (ocfs2_dir_indexed(inode)) {
+		ret = ocfs2_empty_dir_dx(inode, &priv);
+		if (ret)
+			mlog_errno(ret);
+		/*
+		 * We still run ocfs2_dir_foreach to get the checks
+		 * for "." and "..".
+		 */
+	}
+
 	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
 	if (ret)
 		mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct inode *parent,
 				 struct inode *inode,
 				 struct buffer_head *fe_bh,
-				 struct ocfs2_alloc_context *data_ac)
+				 struct ocfs2_alloc_context *data_ac,
+				 struct buffer_head **ret_new_bh)
 {
 	int status;
 	unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	if (ocfs2_supports_dir_trailer(osb))
+	if (ocfs2_new_dir_wants_trailer(inode))
 		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
 	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(inode, new_bh);
+	if (ocfs2_new_dir_wants_trailer(inode)) {
+		int size = le16_to_cpu(de->rec_len);
+
+		/*
+		 * Figure out the size of the hole left over after
+		 * insertion of '.' and '..'. The trailer wants this
+		 * information.
+		 */
+		size -= OCFS2_DIR_REC_LEN(2);
+		size -= sizeof(struct ocfs2_dir_block_trailer);
+
+		ocfs2_init_dir_trailer(inode, new_bh, size);
+	}
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	}
 
 	status = 0;
+	if (ret_new_bh) {
+		*ret_new_bh = new_bh;
+		new_bh = NULL;
+	}
 bail:
 	brelse(new_bh);
 
@@ -1336,20 +2384,427 @@ bail:
 	return status;
 }
 
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+				     handle_t *handle, struct inode *dir,
+				     struct buffer_head *di_bh,
+				     struct buffer_head *dirdata_bh,
+				     struct ocfs2_alloc_context *meta_ac,
+				     int dx_inline, u32 num_entries,
+				     struct buffer_head **ret_dx_root_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	u16 dr_suballoc_bit;
+	u64 dr_blkno;
+	unsigned int num_bits;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
+				   &num_bits, &dr_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Dir %llu, attach new index block: %llu\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+	     (unsigned long long)dr_blkno);
+
+	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+	if (dx_root_bh == NULL) {
+		ret = -EIO;
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+
+	ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	memset(dx_root, 0, osb->sb->s_blocksize);
+	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+	dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+	dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+	dx_root->dr_num_entries = cpu_to_le32(num_entries);
+	if (le16_to_cpu(trailer->db_free_rec_len))
+		dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+	else
+		dx_root->dr_free_blk = cpu_to_le64(0);
+
+	if (dx_inline) {
+		dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+		dx_root->dr_entries.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+	} else {
+		dx_root->dr_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+	}
+
+	ret = ocfs2_journal_dirty(handle, dx_root_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di->i_dx_root = cpu_to_le64(dr_blkno);
+
+	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	*ret_dx_root_bh = dx_root_bh;
+	dx_root_bh = NULL;
+
+out:
+	brelse(dx_root_bh);
+	return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+				       handle_t *handle, struct inode *dir,
+				       struct buffer_head **dx_leaves,
+				       int num_dx_leaves, u64 start_blk)
+{
+	int ret, i;
+	struct ocfs2_dx_leaf *dx_leaf;
+	struct buffer_head *bh;
+
+	for (i = 0; i < num_dx_leaves; i++) {
+		bh = sb_getblk(osb->sb, start_blk + i);
+		if (bh == NULL) {
+			ret = -EIO;
+			goto out;
+		}
+		dx_leaves[i] = bh;
+
+		ocfs2_set_new_buffer_uptodate(dir, bh);
+
+		ret = ocfs2_journal_access_dl(handle, dir, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+		memset(dx_leaf, 0, osb->sb->s_blocksize);
+		strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+		dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+		dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+		dx_leaf->dl_list.de_count =
+			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+		mlog(0,
+		     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+		     (unsigned long long)bh->b_blocknr,
+		     le16_to_cpu(dx_leaf->dl_list.de_count));
+
+		ocfs2_journal_dirty(handle, bh);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+				      u32 cpos, handle_t *handle,
+				      struct ocfs2_alloc_context *data_ac,
+				      struct buffer_head **dx_leaves,
+				      int num_dx_leaves, u64 *ret_phys_blkno)
+{
+	int ret;
+	u32 phys, num;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	/*
+	 * XXX: For create, this should claim cluster for the index
+	 * *before* the unindexed insert so that we have a better
+	 * chance of contiguousness as the directory grows in number
+	 * of entries.
+	 */
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Format the new cluster first. That way, we're inserting
+	 * valid data.
+	 */
+	phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+	ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+					  num_dx_leaves, phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_phys_blkno = phys_blkno;
+out:
+	return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, handle_t *handle,
+				    struct ocfs2_alloc_context *data_ac,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves)
+{
+	int ret;
+	u64 phys_blkno;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+					 num_dx_leaves, &phys_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+				  meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+							int *ret_num_leaves)
+{
+	int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+	struct buffer_head **dx_leaves;
+
+	dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+			    GFP_NOFS);
+	if (dx_leaves && ret_num_leaves)
+		*ret_num_leaves = num_dx_leaves;
+
+	return dx_leaves;
+}
+
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh,
+				 struct ocfs2_alloc_context *data_ac,
+				 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *leaf_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct ocfs2_dx_hinfo hinfo;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dx_entry_list *entry_list;
+
+	/*
+	 * Our strategy is to create the directory as though it were
+	 * unindexed, then add the index block. This works with very
+	 * little complication since the state of a new directory is a
+	 * very well known quantity.
+	 *
+	 * Essentially, we have two dirents ("." and "..") in the 1st
+	 * block which need indexing. These are easily inserted into
+	 * the index block.
+	 */
+
+	ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+				    data_ac, &leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+					meta_ac, 1, 2, &dx_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+	entry_list = &dx_root->dr_entries;
+
+	/* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+	ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+	ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+	ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+out:
+	brelse(dx_root_bh);
+	brelse(leaf_bh);
+	return ret;
+}
+
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac)
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac)
+
 {
 	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
 
+	if (ocfs2_supports_indexed_dirs(osb))
+		return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+					     data_ac, meta_ac);
+
 	return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
-				     data_ac);
+				     data_ac, NULL);
+}
+
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+				    handle_t *handle,
+				    struct buffer_head **dx_leaves,
+				    int num_dx_leaves,
+				    u32 *num_dx_entries,
+				    struct buffer_head *dirent_bh)
+{
+	int ret = 0, namelen, i;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct buffer_head *dx_leaf_bh;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		namelen = de->name_len;
+		if (!namelen || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+		i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+		dx_leaf_bh = dx_leaves[i];
+
+		ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+						 dirent_blk, dx_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		*num_dx_entries = *num_dx_entries + 1;
+
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+out:
+	return ret;
+}
+
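ocfs2_dx_dir_index_block() hashes every live name and uses ocfs2_dx_dir_hash_idx() to pick the leaf block within the index cluster; per the comment further below, that block offset is a constant mask of the minor hash. A sketch of the bucketing with a stand-in FNV-1a hash (the real ocfs2 name hash differs):

#include <stdio.h>
#include <stdint.h>

static uint32_t name_hash(const char *name)	/* stand-in hash */
{
	uint32_t h = 2166136261u;

	for (; *name; name++)
		h = (h ^ (unsigned char)*name) * 16777619u;
	return h;
}

int main(void)
{
	const char *names[] = { ".", "..", "alpha", "beta", "gamma" };
	unsigned int num_leaves = 8;	/* blocks per cluster, power of 2 */

	for (unsigned int i = 0; i < 5; i++) {
		uint32_t minor = name_hash(names[i]);

		/* leaf index is a constant mask of the minor hash */
		printf("%-6s -> leaf %u\n", names[i],
		       minor & (num_leaves - 1));
	}
	return 0;
}
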
+/*
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+					  struct buffer_head *dx_root_bh,
+					  struct buffer_head *dirent_bh)
+{
+	char *de_buf, *limit;
+	struct ocfs2_dx_root_block *dx_root;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dx_hinfo hinfo;
+	u64 dirent_blk = dirent_bh->b_blocknr;
+
+	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+	de_buf = dirent_bh->b_data;
+	limit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (!de->name_len || !de->inode)
+			goto inc;
+
+		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+		mlog(0,
+		     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
+		     (unsigned long long)dir->i_ino, hinfo.major_hash,
+		     hinfo.minor_hash,
+		     le16_to_cpu(dx_root->dr_entries.de_num_used),
+		     de->name_len, de->name);
+
+		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+					   dirent_blk);
+
+		le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+					 struct buffer_head *di_bh)
+{
+	int dirent_count = 0;
+	char *de_buf, *limit;
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
+
+		if (de->name_len && de->inode)
+			dirent_count++;
+
+		de_buf += le16_to_cpu(de->rec_len);
+	}
+
+	/* We are careful to leave room for one extra record. */
+	return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
 }
 
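The inline-vs-extent decision above is just a count of live dirents against the capacity of an inline dx root, leaving headroom for one more insert. A toy model of that check (the capacity value is an arbitrary stand-in):

#include <stdio.h>

struct rec { int live; };	/* 1 if name_len && inode, else 0 */

/* Return 1 when all current entries, plus one more, fit inline. */
static int should_be_inline(const struct rec *recs, int nr, int capacity)
{
	int i, count = 0;

	for (i = 0; i < nr; i++)
		count += recs[i].live;

	return count < capacity;	/* leaves room for one extra record */
}

int main(void)
{
	struct rec dir[] = { {1}, {1}, {0}, {1} };	/* 3 live entries */

	printf("%s\n", should_be_inline(dir, 4, 4) ? "inline" : "extent");
	return 0;
}
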
 /*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
  * expansion from an inline directory to one with extents. The first dir block
  * in that case is taken from the inline data portion of the inode block.
  *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily found in the last dirent,
+ * even after expansion. The directory indexing code wants this value for
+ * free space accounting. We do this here since we're already walking the
+ * entire dir block.
+ *
  * We add the dir trailer if this filesystem wants it.
  */
-static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     struct super_block *sb)
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+					     struct inode *dir)
 {
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
 	unsigned int new_size = sb->s_blocksize;
-	unsigned int bytes;
+	unsigned int bytes, this_hole;
+	unsigned int largest_hole = 0;
 
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		new_size = ocfs2_dir_trailer_blk_off(sb);
 
 	bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
 	de_buf = start;
 	de = (struct ocfs2_dir_entry *)de_buf;
 	do {
+		this_hole = ocfs2_figure_dirent_hole(de);
+		if (this_hole > largest_hole)
+			largest_hole = this_hole;
+
 		prev_de = de;
 		de_buf += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)de_buf;
 	} while (de_buf < limit);
 
 	le16_add_cpu(&prev_de->rec_len, bytes);
+
+	/* We need to double check this after modification of the final
+	 * dirent. */
+	this_hole = ocfs2_figure_dirent_hole(prev_de);
+	if (this_hole > largest_hole)
+		largest_hole = this_hole;
+
+	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+		return largest_hole;
+	return 0;
 }
 
 /*
@@ -1396,36 +2873,68 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
  */
 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
+				   struct ocfs2_dir_lookup_result *lookup,
 				   struct buffer_head **first_block_bh)
 {
-	u32 alloc, bit_off, len;
+	u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
 	struct super_block *sb = dir->i_sb;
-	int ret, credits = ocfs2_inline_to_extents_credits(sb);
-	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
+	int ret, i, num_dx_leaves = 0, dx_inline = 0,
+		credits = ocfs2_inline_to_extents_credits(sb);
+	u64 dx_insert_blkno, blkno,
+		bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
 	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct buffer_head *dirdata_bh = NULL;
+	struct buffer_head *dx_root_bh = NULL;
+	struct buffer_head **dx_leaves = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
-	int did_quota = 0;
+	struct ocfs2_extent_tree dx_et;
+	int did_quota = 0, bytes_allocated = 0;
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
+	dx_alloc = 0;
+
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		credits += ocfs2_add_dir_index_credits(sb);
+
+		dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+		if (!dx_inline) {
+			/* Add one more cluster for an index leaf */
+			dx_alloc++;
+			dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+								&num_dx_leaves);
+			if (!dx_leaves) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* This gets us the dx_root */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
 
 	/*
-	 * We should never need more than 2 clusters for this -
-	 * maximum dirent size is far less than one block. In fact,
-	 * the only time we'd need more than one cluster is if
+	 * We should never need more than 2 clusters for the unindexed
+	 * tree - maximum dirent size is far less than one block. In
+	 * fact, the only time we'd need more than one cluster is if
 	 * blocksize == clustersize and the dirent won't fit in the
 	 * extra space that the expansion to a single block gives. As
	 * of today, that only happens on 4k/4k file systems.
 	 */
 	BUG_ON(alloc > 2);
 
-	ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+	ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 	/*
 	 * Prepare for worst case allocation scenario of two separate
-	 * extents.
+	 * extents in the unindexed tree.
 	 */
 	if (alloc == 2)
 		credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
-				       ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+				       ocfs2_clusters_to_bytes(osb->sb,
							       alloc + dx_alloc))) {
 		ret = -EDQUOT;
 		goto out_commit;
 	}
 	did_quota = 1;
+
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Allocate our index cluster first, to maximize the
+		 * possibility that unindexed leaves grow
+		 * contiguously.
+		 */
+		ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+						 dx_leaves, num_dx_leaves,
+						 &dx_insert_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+	}
 
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		mlog_errno(ret);
 		goto out_commit;
 	}
+	bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 
 	/*
 	 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-	if (ocfs2_supports_dir_trailer(osb))
-		ocfs2_init_dir_trailer(dir, dirdata_bh);
+	i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+	if (ocfs2_new_dir_wants_trailer(dir)) {
+		/*
+		 * Prepare the dir trailer up front. It will otherwise look
+		 * like a valid dirent. Even if inserting the index fails
+		 * (unlikely), then all we'll have done is given the first
+		 * dir block a small amount of fragmentation.
+		 */
+		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+	}
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+		/*
+		 * Dx dirs with an external cluster need to do this up
+		 * front. Inline dx roots get handled later, after
+		 * we've allocated our root block. We get passed back
+		 * a total number of items so that dr_num_entries can
+		 * be correctly set once the dx_root has been
+		 * allocated.
+		 */
+		ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+					       num_dx_leaves, &num_dx_entries,
+					       dirdata_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
 	/*
 	 * Set extent, i_size, etc on the directory. After this, the
 	 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_commit;
 	}
 
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+						dirdata_bh, meta_ac, dx_inline,
+						num_dx_entries, &dx_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		if (dx_inline) {
+			ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+						      dirdata_bh);
+		} else {
+			ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
+			ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+						  dx_insert_blkno, 1, 0, NULL);
+			if (ret)
+				mlog_errno(ret);
+		}
+	}
+
 	/*
 	 * We asked for two clusters, but only got one in the 1st
 	 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 			mlog_errno(ret);
 			goto out_commit;
 		}
+		bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
 	}
 
 	*first_block_bh = dirdata_bh;
 	dirdata_bh = NULL;
+	if (ocfs2_supports_indexed_dirs(osb)) {
+		unsigned int off;
+
+		if (!dx_inline) {
+			/*
+			 * We need to return the correct block within the
+			 * cluster which should hold our entry.
+			 */
+			off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+						    &lookup->dl_hinfo);
+			get_bh(dx_leaves[off]);
+			lookup->dl_dx_leaf_bh = dx_leaves[off];
+		}
+		lookup->dl_dx_root_bh = dx_root_bh;
+		dx_root_bh = NULL;
+	}
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir,
-					  ocfs2_clusters_to_bytes(osb->sb, 2));
+		vfs_dq_free_space_nodirty(dir, bytes_allocated);
+
 	ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
 out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (dx_leaves) {
+		for (i = 0; i < num_dx_leaves; i++)
+			brelse(dx_leaves[i]);
+		kfree(dx_leaves);
+	}
 
 	brelse(dirdata_bh);
+	brelse(dx_root_bh);
 
 	return ret;
 }
@@ -1658,11 +3258,14 @@ bail:
  * is to be turned into an extent based one. The size of the dirent to
  * insert might be larger than the space gained by growing to just one
  * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
  */
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
 			    unsigned int blocks_wanted,
+			    struct ocfs2_dir_lookup_result *lookup,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_extent_tree et;
+	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
 
 	mlog_entry_void();
 
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * This would be a code error as an inline directory should
+		 * never have an index root.
+		 */
+		BUG_ON(dx_root_bh);
+
 		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
-						 blocks_wanted, &new_bh);
+						 blocks_wanted, lookup,
+						 &new_bh);
 		if (status) {
 			mlog_errno(status);
 			goto bail;
 		}
 
+		/* Expansion from inline to an indexed directory will
+		 * have given us this. */
+		dx_root_bh = lookup->dl_dx_root_bh;
+
 		if (blocks_wanted == 1) {
 			/*
 			 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	}
 
 do_extend:
+	if (ocfs2_dir_indexed(dir))
+		credits++; /* For attaching the new dirent block to the
+			    * dx_root */
+
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
@@ -1781,9 +3400,19 @@ do_extend:
 
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
-	if (ocfs2_dir_has_trailer(dir)) {
+	if (ocfs2_supports_dir_trailer(dir)) {
 		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
-		ocfs2_init_dir_trailer(dir, new_bh);
+
+		ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+		if (ocfs2_dir_indexed(dir)) {
+			status = ocfs2_dx_dir_link_trailer(dir, handle,
+							   dx_root_bh, new_bh);
+			if (status) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * This calculates how many free bytes we'd have in block zero, should
 	 * this function force expansion to an extent tree.
 	 */
-	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+	if (ocfs2_new_dir_wants_trailer(dir))
 		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
 	else
 		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
 	return status;
 }
 
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+	const struct ocfs2_dx_entry *entry1 = a;
+	const struct ocfs2_dx_entry *entry2 = b;
+	u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+	u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+	u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+	u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+	if (major_hash1 > major_hash2)
+		return 1;
+	if (major_hash1 < major_hash2)
+		return -1;
+
+	/*
+	 * It is not strictly necessary to sort by minor
+	 */
+	if (minor_hash1 > minor_hash2)
+		return 1;
+	if (minor_hash1 < minor_hash2)
+		return -1;
+	return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+	struct ocfs2_dx_entry *entry1 = a;
+	struct ocfs2_dx_entry *entry2 = b;
+	struct ocfs2_dx_entry tmp;
+
+	BUG_ON(size != sizeof(*entry1));
+
+	tmp = *entry1;
+	*entry1 = *entry2;
+	*entry2 = tmp;
+}
+
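dx_leaf_sort_cmp() orders entries by (major, minor) hash so that the split logic below can operate on sorted input. The same comparator shape, driven by qsort() in a userspace sketch:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dx { uint32_t major, minor; };

static int dx_cmp(const void *a, const void *b)
{
	const struct dx *e1 = a, *e2 = b;

	if (e1->major != e2->major)
		return e1->major > e2->major ? 1 : -1;
	if (e1->minor != e2->minor)
		return e1->minor > e2->minor ? 1 : -1;
	return 0;
}

int main(void)
{
	struct dx v[] = { {3, 9}, {1, 5}, {3, 2}, {1, 1} };

	qsort(v, 4, sizeof(v[0]), dx_cmp);
	for (int i = 0; i < 4; i++)
		printf("(%u,%u) ", v[i].major, v[i].minor);
	printf("\n");	/* (1,1) (1,5) (3,2) (3,9) */
	return 0;
}
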
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+	int i, num = le16_to_cpu(dl_list->de_num_used);
+
+	for (i = 0; i < (num - 1); i++) {
+		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+		    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+			return 0;
+	}
+
+	return 1;
+}
+
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
3672 * There's a couple rare, but nasty corner cases we have to
3673 * check for here. All of them involve a leaf where all value
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
3713 * val can not be the same as insert hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guaranteed that not all records in this block are the same,
3730 * we simple travel forward, from the median, and pick the 1st
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
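/*
 * A minimal userspace sketch of the split policy above, assuming a
 * plain sorted array of major hashes (names here are illustrative,
 * not part of ocfs2):
 *
 *	static int pick_split_hash(const uint32_t *sorted, int num_used,
 *				   uint32_t leaf_cpos, uint32_t insert_hash,
 *				   uint32_t *split_hash)
 *	{
 *		int i;
 *
 *		if (sorted[0] == sorted[num_used - 1]) {
 *			uint32_t val = sorted[0];	// all values equal
 *
 *			if (val == insert_hash)
 *				return -1;		// splitting can't help
 *			if (val == leaf_cpos) {
 *				*split_hash = leaf_cpos + 1;
 *				return 0;
 *			}
 *			*split_hash = (val > insert_hash) ? val : insert_hash;
 *			return 0;
 *		}
 *
 *		for (i = num_used / 2; i < num_used; i++)	// from median up
 *			if (sorted[i] > leaf_cpos)
 *				break;
 *		*split_hash = sorted[i];
 *		return 0;
 *	}
 */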
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
3748 * Since the block offset inside a leaf (cluster) is a constant mask
3749 * of minor_hash, we can optimize - an item at block offset X within
3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
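/*
 * The invariant above, as a stand-alone sketch (assuming, as the
 * code does elsewhere, that blocks-per-cluster is a power of two;
 * the real mapping is __ocfs2_dx_dir_hash_idx()):
 *
 *	static inline int dx_leaf_index(unsigned int minor_hash,
 *					unsigned int blocks_per_cluster)
 *	{
 *		return minor_hash & (blocks_per_cluster - 1);
 *	}
 *
 * Because the leaf index is a pure function of minor_hash, an entry
 * keeps its block offset when it moves to the new cluster.
 */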
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3845		mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance a non-full leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928	 * won't happen if there's an error before that. Once the
3929	 * insert is done, we can transfer entries from one leaf into
3930	 * the other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
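/*
 * To recap the ordering in ocfs2_dx_dir_rebalance(): sort the full
 * leaf in place, pick a split hash, reserve journal access on every
 * original leaf block, grow the index by a cluster at the split
 * hash, and only then transfer the high-hash entries - so nothing
 * is moved until every step that can fail has already succeeded.
 */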
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
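/*
 * Note the retry pattern above: at most one rebalance per insert,
 * tracked by the 'rebalanced' flag, so a lookup that still finds a
 * full leaf afterwards returns -ENOSPC instead of looping forever.
 */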
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
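/*
 * A minimal model of the free-list walk above, with a hypothetical
 * node type (the on-disk version chains directory blocks through
 * db_free_next): keep the previous node so the caller can later
 * unlink the hit; a NULL result corresponds to -ENOSPC.
 *
 *	struct free_node { struct free_node *next; int free_len; };
 *
 *	static struct free_node *find_fit(struct free_node *head, int want,
 *					  struct free_node **prev_ret)
 *	{
 *		struct free_node *prev = NULL, *cur;
 *
 *		for (cur = head; cur; prev = cur, cur = cur->next)
 *			if (cur->free_len >= want)
 *				break;
 *		*prev_ret = prev;	// NULL: hit is at the list head
 *		return cur;
 *	}
 */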
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166	 * failure to add the dx_root_bh to the journal won't result
4167	 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real
4302 * error and we can't continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
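/*
 * To recap the flow above: insert preparation for an indexed
 * directory is two-phase. ocfs2_find_dir_space_dx() first guarantees
 * a slot in the index (rebalancing a full leaf at most once), then
 * the unindexed tree is searched via the free-space list, with
 * ocfs2_extend_dir() as the fallback when no existing block fits.
 */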
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. On success, zero is returned, along with enough
4353 * context in the dir lookup result that ocfs2_add_entry() will be
4354 * able to complete the task with minimal performance impact.
4355 */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh)
+				 struct ocfs2_dir_lookup_result *lookup)
 {
 	int ret;
 	unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
 	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	*ret_de_bh = NULL;
-
 	if (!namelen) {
 		ret = -EINVAL;
 		mlog_errno(ret);
 		goto out;
 	}
 
+	/*
+	 * Do this up front to reduce confusion.
+	 *
+	 * The directory might start inline, then be turned into an
+	 * indexed one, in which case we'd need to hash deep inside
+	 * ocfs2_find_dir_space_id(). Since
+	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+	 * done, there seems no point in spreading out the calls. We
+	 * can optimize away the case where the file system doesn't
+	 * support indexing.
+	 */
+	if (ocfs2_supports_indexed_dirs(osb))
+		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+	if (ocfs2_dir_indexed(dir)) {
+		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+						      name, namelen, lookup);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
 					      namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(bh);
 
 		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-				       &bh);
+				       lookup, &bh);
 		if (ret) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 		BUG_ON(!bh);
 	}
 
-	*ret_de_bh = bh;
+	lookup->dl_leaf_bh = bh;
 	bh = NULL;
 out:
 	brelse(bh);
 	return ret;
 }
4432
4433static int ocfs2_dx_dir_remove_index(struct inode *dir,
4434 struct buffer_head *di_bh,
4435 struct buffer_head *dx_root_bh)
4436{
4437 int ret;
4438 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4439 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4440 struct ocfs2_dx_root_block *dx_root;
4441 struct inode *dx_alloc_inode = NULL;
4442 struct buffer_head *dx_alloc_bh = NULL;
4443 handle_t *handle;
4444 u64 blk;
4445 u16 bit;
4446 u64 bg_blkno;
4447
4448 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4449
4450 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4451 EXTENT_ALLOC_SYSTEM_INODE,
4452 le16_to_cpu(dx_root->dr_suballoc_slot));
4453 if (!dx_alloc_inode) {
4454 ret = -ENOMEM;
4455 mlog_errno(ret);
4456 goto out;
4457 }
4458 mutex_lock(&dx_alloc_inode->i_mutex);
4459
4460 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4461 if (ret) {
4462 mlog_errno(ret);
4463 goto out_mutex;
4464 }
4465
4466 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4467 if (IS_ERR(handle)) {
4468 ret = PTR_ERR(handle);
4469 mlog_errno(ret);
4470 goto out_unlock;
4471 }
4472
4473 ret = ocfs2_journal_access_di(handle, dir, di_bh,
4474 OCFS2_JOURNAL_ACCESS_WRITE);
4475 if (ret) {
4476 mlog_errno(ret);
4477 goto out_commit;
4478 }
4479
4480 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4481 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4482 di->i_dx_root = cpu_to_le64(0ULL);
4483
4484 ocfs2_journal_dirty(handle, di_bh);
4485
4486 blk = le64_to_cpu(dx_root->dr_blkno);
4487 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4488 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4489 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4490 bit, bg_blkno, 1);
4491 if (ret)
4492 mlog_errno(ret);
4493
4494out_commit:
4495 ocfs2_commit_trans(osb, handle);
4496
4497out_unlock:
4498 ocfs2_inode_unlock(dx_alloc_inode, 1);
4499
4500out_mutex:
4501 mutex_unlock(&dx_alloc_inode->i_mutex);
4502 brelse(dx_alloc_bh);
4503out:
4504 iput(dx_alloc_inode);
4505 return ret;
4506}
4507
4508int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4509{
4510 int ret;
4511 unsigned int uninitialized_var(clen);
4512 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
4513 u64 uninitialized_var(blkno);
4514 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4515 struct buffer_head *dx_root_bh = NULL;
4516 struct ocfs2_dx_root_block *dx_root;
4517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4518 struct ocfs2_cached_dealloc_ctxt dealloc;
4519 struct ocfs2_extent_tree et;
4520
4521 ocfs2_init_dealloc_ctxt(&dealloc);
4522
4523 if (!ocfs2_dir_indexed(dir))
4524 return 0;
4525
4526 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4527 if (ret) {
4528 mlog_errno(ret);
4529 goto out;
4530 }
4531 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4532
4533 if (ocfs2_dx_root_inline(dx_root))
4534 goto remove_index;
4535
4536 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4537
4538 /* XXX: What if dr_clusters is too large? */
4539 while (le32_to_cpu(dx_root->dr_clusters)) {
4540 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4541 major_hash, &cpos, &blkno, &clen);
4542 if (ret) {
4543 mlog_errno(ret);
4544 goto out;
4545 }
4546
4547 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4548
4549 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
4550 &dealloc);
4551 if (ret) {
4552 mlog_errno(ret);
4553 goto out;
4554 }
4555
4556 if (cpos == 0)
4557 break;
4558
4559 major_hash = cpos - 1;
4560 }
4561
4562remove_index:
4563 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4564 if (ret) {
4565 mlog_errno(ret);
4566 goto out;
4567 }
4568
4569 ocfs2_remove_from_cache(dir, dx_root_bh);
4570out:
4571 ocfs2_schedule_truncate_log_flush(osb, 1);
4572 ocfs2_run_deallocs(osb, &dealloc);
4573
4574 brelse(dx_root_bh);
4575 return ret;
4576}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+	u32	major_hash;
+	u32	minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+	struct buffer_head	*dl_leaf_bh;	/* Unindexed leaf
+						 * block */
+	struct ocfs2_dir_entry	*dl_entry;	/* Target dirent in
+						 * unindexed leaf */
+
+	struct buffer_head	*dl_dx_root_bh;	/* Root of indexed
+						 * tree */
+
+	struct buffer_head	*dl_dx_leaf_bh;	/* Indexed leaf block */
+	struct ocfs2_dx_entry	*dl_dx_entry;	/* Target dx_entry in
+						 * indexed leaf */
+	struct ocfs2_dx_hinfo	dl_hinfo;	/* Name hash results */
+
+	struct buffer_head	*dl_prev_leaf_bh;/* Previous entry in
+						  * dir free space
+						  * list. NULL if
+						  * previous entry is
+						  * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+		     struct inode *dir,
+		     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
 		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh);
+		       struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
 		      struct inode *dir,
 		      const char *name, int namelen,
 		      struct inode *inode, u64 blkno,
 		      struct buffer_head *parent_fe_bh,
-		      struct buffer_head *insert_bh);
+		      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
 				  struct dentry *dentry,
 				  struct inode *inode, u64 blkno,
 				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
+				  struct ocfs2_dir_lookup_result *lookup)
 {
 	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
 				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
+				 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct ocfs2_dir_lookup_result *res,
 		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
 			     struct inode *inode,
-			     struct buffer_head **dirent_bh,
-			     struct ocfs2_dir_entry **dirent);
+			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct buffer_head *parent_fe_bh,
 				 const char *name,
 				 int namelen,
-				 struct buffer_head **ret_de_bh);
+				 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       handle_t *handle,
 		       struct inode *parent,
 		       struct inode *inode,
 		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac);
+		       struct ocfs2_alloc_context *data_ac,
+		       struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 							    void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
 	DLM_MLE_BLOCK,
 	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
+	DLM_MLE_MIGRATION,
+	DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-	struct list_head list;
+	struct hlist_node master_hash_node;
 	struct list_head hb_events;
 	struct dlm_ctxt *dlm;
 	spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
 	enum dlm_mle_type type;
 	struct o2hb_callback_func mle_hb_up;
 	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
+	struct dlm_lock_resource *mleres;
+	unsigned char mname[DLM_LOCKID_NAME_MAX];
+	unsigned int mnamelen;
+	unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
 	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct dlm_recovery_ctxt reco;
 	spinlock_t master_lock;
-	struct list_head master_list;
+	struct hlist_head **master_hash;
 	struct list_head mle_hb_events;
 
 	/* these give a really vague idea of the system load */
-	atomic_t local_resources;
-	atomic_t remote_resources;
-	atomic_t unknown_resources;
+	atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+	atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+	atomic_t res_tot_count;
+	atomic_t res_cur_count;
 
 	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
 	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+						 unsigned i)
+{
+	return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+			(i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-			      struct dlm_lock_resource *res,
-			      u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 						 const char *lockid,
 						 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 						  DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
 	return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+					 struct dlm_lock_resource *res,
+					 u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+					    struct dlm_lock_resource *res,
+					    u8 owner)
+{
+	assert_spin_locked(&res->spinlock);
+
+	if (owner != res->owner)
+		dlm_set_lockres_owner(dlm, res, owner);
+}
 
 #endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
 	int out = 0;
-	unsigned int namelen;
-	const char *name;
 	char *mle_type;
 
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
 	if (mle->type == DLM_MLE_BLOCK)
 		mle_type = "BLK";
 	else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 	else
 		mle_type = "MIG";
 
-	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
 	out += snprintf(buf + out, len - out,
 			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
 			mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
-	int out = 0;
-	unsigned long total = 0;
+	struct hlist_head *bucket;
+	struct hlist_node *list;
+	int i, out = 0;
+	unsigned long total = 0, longest = 0, bktcnt;
 
 	out += snprintf(db->buf + out, db->len - out,
 			"Dumping MLEs for Domain: %s\n", dlm->name);
 
 	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list) {
-		++total;
-		if (db->len - out < 200)
-			continue;
-		out += dump_mle(mle, db->buf + out, db->len - out);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = dlm_master_hash(dlm, i);
+		hlist_for_each(list, bucket) {
+			mle = hlist_entry(list, struct dlm_master_list_entry,
+					  master_hash_node);
+			++total;
+			++bktcnt;
+			if (db->len - out < 200)
+				continue;
+			out += dump_mle(mle, db->buf + out, db->len - out);
+		}
+		longest = max(longest, bktcnt);
+		bktcnt = 0;
 	}
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(db->buf + out, db->len - out,
-			"Total on list: %ld\n", total);
+			"Total: %ld, Longest: %ld\n", total, longest);
 	return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 	int out = 0;
 	struct dlm_reco_node_data *node;
 	char *state;
-	int lres, rres, ures, tres;
-
-	lres = atomic_read(&dlm->local_resources);
-	rres = atomic_read(&dlm->remote_resources);
-	ures = atomic_read(&dlm->unknown_resources);
-	tres = lres + rres + ures;
+	int cur_mles = 0, tot_mles = 0;
+	int i;
 
 	spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 				db->buf + out, db->len - out);
 	out += snprintf(db->buf + out, db->len - out, "\n");
 
-	/* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
+	/* Lock Resources: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lock Resources: %d (%d)\n",
+			atomic_read(&dlm->res_cur_count),
+			atomic_read(&dlm->res_tot_count));
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+		cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+	/* MLEs: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+	/* Blocking: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Blocking: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+	/* Mastery: xxx (xxx) */
+	out += snprintf(db->buf + out, db->len - out,
+			"  Mastery: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+	/* Migration: xxx (xxx) */
 	out += snprintf(db->buf + out, db->len - out,
-			"Mastered Resources Total: %d Locally: %d "
-			"Remotely: %d Unknown: %d\n",
-			tres, lres, rres, ures);
+			"  Migration: %d (%d)\n",
+			atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+			atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
 	/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
 	out += snprintf(db->buf + out, db->len - out,
 			"Lists: Dirty=%s Purge=%s PendingASTs=%s "
-			"PendingBASTs=%s Master=%s\n",
+			"PendingBASTs=%s\n",
 			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
 			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
 	/* Purge Count: xxx Refs: xxx */
 	out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+	if (dlm->master_hash)
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
 	if (dlm->name)
 		kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+	dlm->master_hash = (struct hlist_head **)
+				dlm_alloc_pagevec(DLM_HASH_PAGES);
+	if (!dlm->master_hash) {
+		mlog_errno(-ENOMEM);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
 	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
 	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 		kfree(dlm->name);
 		kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	init_waitqueue_head(&dlm->reco.event);
 	init_waitqueue_head(&dlm->ast_wq);
 	init_waitqueue_head(&dlm->migration_wq);
-	INIT_LIST_HEAD(&dlm->master_list);
 	INIT_LIST_HEAD(&dlm->mle_hb_events);
 
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	atomic_set(&dlm->local_resources, 0);
-	atomic_set(&dlm->remote_resources, 0);
-	atomic_set(&dlm->unknown_resources, 0);
+
+	atomic_set(&dlm->res_tot_count, 0);
+	atomic_set(&dlm->res_cur_count, 0);
+	for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+		atomic_set(&dlm->mle_tot_count[i], 0);
+		atomic_set(&dlm->mle_cur_count[i], 0);
+	}
 
 	spin_lock_init(&dlm->work_lock);
 	INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
73 const char *name, 73 const char *name,
74 unsigned int namelen) 74 unsigned int namelen)
75{ 75{
76 struct dlm_lock_resource *res;
77
78 if (dlm != mle->dlm) 76 if (dlm != mle->dlm)
79 return 0; 77 return 0;
80 78
81 if (mle->type == DLM_MLE_BLOCK || 79 if (namelen != mle->mnamelen ||
82 mle->type == DLM_MLE_MIGRATION) { 80 memcmp(name, mle->mname, namelen) != 0)
83 if (namelen != mle->u.name.len || 81 return 0;
84 memcmp(name, mle->u.name.name, namelen)!=0) 82
85 return 0;
86 } else {
87 res = mle->u.res;
88 if (namelen != res->lockname.len ||
89 memcmp(res->lockname.name, name, namelen) != 0)
90 return 0;
91 }
92 return 1; 83 return 1;
93} 84}
94 85
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
283 274
284 mle->dlm = dlm; 275 mle->dlm = dlm;
285 mle->type = type; 276 mle->type = type;
286 INIT_LIST_HEAD(&mle->list); 277 INIT_HLIST_NODE(&mle->master_hash_node);
287 INIT_LIST_HEAD(&mle->hb_events); 278 INIT_LIST_HEAD(&mle->hb_events);
288 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 279 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
289 spin_lock_init(&mle->spinlock); 280 spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
295 mle->new_master = O2NM_MAX_NODES; 286 mle->new_master = O2NM_MAX_NODES;
296 mle->inuse = 0; 287 mle->inuse = 0;
297 288
289 BUG_ON(mle->type != DLM_MLE_BLOCK &&
290 mle->type != DLM_MLE_MASTER &&
291 mle->type != DLM_MLE_MIGRATION);
292
298 if (mle->type == DLM_MLE_MASTER) { 293 if (mle->type == DLM_MLE_MASTER) {
299 BUG_ON(!res); 294 BUG_ON(!res);
300 mle->u.res = res; 295 mle->mleres = res;
301 } else if (mle->type == DLM_MLE_BLOCK) { 296 memcpy(mle->mname, res->lockname.name, res->lockname.len);
302 BUG_ON(!name); 297 mle->mnamelen = res->lockname.len;
303 memcpy(mle->u.name.name, name, namelen); 298 mle->mnamehash = res->lockname.hash;
304 mle->u.name.len = namelen; 299 } else {
305 } else /* DLM_MLE_MIGRATION */ {
306 BUG_ON(!name); 300 BUG_ON(!name);
307 memcpy(mle->u.name.name, name, namelen); 301 mle->mleres = NULL;
308 mle->u.name.len = namelen; 302 memcpy(mle->mname, name, namelen);
303 mle->mnamelen = namelen;
304 mle->mnamehash = dlm_lockid_hash(name, namelen);
309 } 305 }
310 306
307 atomic_inc(&dlm->mle_tot_count[mle->type]);
308 atomic_inc(&dlm->mle_cur_count[mle->type]);
309
311 /* copy off the node_map and register hb callbacks on our copy */ 310 /* copy off the node_map and register hb callbacks on our copy */
312 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 311 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
313 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 312 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
318 __dlm_mle_attach_hb_events(dlm, mle); 317 __dlm_mle_attach_hb_events(dlm, mle);
319} 318}
320 319
320void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
321{
322 assert_spin_locked(&dlm->spinlock);
323 assert_spin_locked(&dlm->master_lock);
324
325 if (!hlist_unhashed(&mle->master_hash_node))
326 hlist_del_init(&mle->master_hash_node);
327}
328
329void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
330{
331 struct hlist_head *bucket;
332
333 assert_spin_locked(&dlm->master_lock);
334
335 bucket = dlm_master_hash(dlm, mle->mnamehash);
336 hlist_add_head(&mle->master_hash_node, bucket);
337}
321 338
322/* returns 1 if found, 0 if not */ 339/* returns 1 if found, 0 if not */
323static int dlm_find_mle(struct dlm_ctxt *dlm, 340static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
325 char *name, unsigned int namelen) 342 char *name, unsigned int namelen)
326{ 343{
327 struct dlm_master_list_entry *tmpmle; 344 struct dlm_master_list_entry *tmpmle;
345 struct hlist_head *bucket;
346 struct hlist_node *list;
347 unsigned int hash;
328 348
329 assert_spin_locked(&dlm->master_lock); 349 assert_spin_locked(&dlm->master_lock);
330 350
331 list_for_each_entry(tmpmle, &dlm->master_list, list) { 351 hash = dlm_lockid_hash(name, namelen);
352 bucket = dlm_master_hash(dlm, hash);
353 hlist_for_each(list, bucket) {
354 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
355 master_hash_node);
332 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 356 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
333 continue; 357 continue;
334 dlm_get_mle(tmpmle); 358 dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
408 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 432 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
409 dlm = mle->dlm; 433 dlm = mle->dlm;
410 434
411 if (mle->type != DLM_MLE_MASTER) {
412 mlog(0, "calling mle_release for %.*s, type %d\n",
413 mle->u.name.len, mle->u.name.name, mle->type);
414 } else {
415 mlog(0, "calling mle_release for %.*s, type %d\n",
416 mle->u.res->lockname.len,
417 mle->u.res->lockname.name, mle->type);
418 }
419 assert_spin_locked(&dlm->spinlock); 435 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&dlm->master_lock); 436 assert_spin_locked(&dlm->master_lock);
421 437
438 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
439 mle->type);
440
422 /* remove from list if not already */ 441 /* remove from list if not already */
423 if (!list_empty(&mle->list)) 442 __dlm_unlink_mle(dlm, mle);
424 list_del_init(&mle->list);
425 443
426 /* detach the mle from the domain node up/down events */ 444 /* detach the mle from the domain node up/down events */
427 __dlm_mle_detach_hb_events(dlm, mle); 445 __dlm_mle_detach_hb_events(dlm, mle);
428 446
447 atomic_dec(&dlm->mle_cur_count[mle->type]);
448
429 /* NOTE: kfree under spinlock here. 449 /* NOTE: kfree under spinlock here.
430 * if this is bad, we can move this to a freelist. */ 450 * if this is bad, we can move this to a freelist. */
431 kmem_cache_free(dlm_mle_cache, mle); 451 kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
465 kmem_cache_destroy(dlm_lockres_cache); 485 kmem_cache_destroy(dlm_lockres_cache);
466} 486}
467 487
468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
469 struct dlm_lock_resource *res,
470 u8 owner)
471{
472 assert_spin_locked(&res->spinlock);
473
474 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
475
476 if (owner == dlm->node_num)
477 atomic_inc(&dlm->local_resources);
478 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
479 atomic_inc(&dlm->unknown_resources);
480 else
481 atomic_inc(&dlm->remote_resources);
482
483 res->owner = owner;
484}
485
486void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
487 struct dlm_lock_resource *res, u8 owner)
488{
489 assert_spin_locked(&res->spinlock);
490
491 if (owner == res->owner)
492 return;
493
494 if (res->owner == dlm->node_num)
495 atomic_dec(&dlm->local_resources);
496 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
497 atomic_dec(&dlm->unknown_resources);
498 else
499 atomic_dec(&dlm->remote_resources);
500
501 dlm_set_lockres_owner(dlm, res, owner);
502}
503
504
505static void dlm_lockres_release(struct kref *kref) 488static void dlm_lockres_release(struct kref *kref)
506{ 489{
507 struct dlm_lock_resource *res; 490 struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
527 } 510 }
528 spin_unlock(&dlm->track_lock); 511 spin_unlock(&dlm->track_lock);
529 512
513 atomic_dec(&dlm->res_cur_count);
514
530 dlm_put(dlm); 515 dlm_put(dlm);
531 516
532 if (!hlist_unhashed(&res->hash_node) || 517 if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
607 592
608 kref_init(&res->refs); 593 kref_init(&res->refs);
609 594
595 atomic_inc(&dlm->res_tot_count);
596 atomic_inc(&dlm->res_cur_count);
597
610 /* just for consistency */ 598 /* just for consistency */
611 spin_lock(&res->spinlock); 599 spin_lock(&res->spinlock);
612 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 600 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
843 alloc_mle = NULL; 831 alloc_mle = NULL;
844 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 832 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
845 set_bit(dlm->node_num, mle->maybe_map); 833 set_bit(dlm->node_num, mle->maybe_map);
846 list_add(&mle->list, &dlm->master_list); 834 __dlm_insert_mle(dlm, mle);
847 835
848 /* still holding the dlm spinlock, check the recovery map 836 /* still holding the dlm spinlock, check the recovery map
849 * to see if there are any nodes that still need to be 837 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1270 res->lockname.len, 1258 res->lockname.len,
1271 res->lockname.name); 1259 res->lockname.name);
1272 mle->type = DLM_MLE_MASTER; 1260 mle->type = DLM_MLE_MASTER;
1273 mle->u.res = res; 1261 mle->mleres = res;
1274 } 1262 }
1275 } 1263 }
1276 } 1264 }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
1315 1303
1316 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1304 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1317 1305
1318 if (mle->type != DLM_MLE_MASTER) { 1306 request.namelen = (u8)mle->mnamelen;
1319 request.namelen = mle->u.name.len; 1307 memcpy(request.name, mle->mname, request.namelen);
1320 memcpy(request.name, mle->u.name.name, request.namelen);
1321 } else {
1322 request.namelen = mle->u.res->lockname.len;
1323 memcpy(request.name, mle->u.res->lockname.name,
1324 request.namelen);
1325 }
1326 1308
1327again: 1309again:
1328 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1310 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
1575 // "add the block.\n"); 1557 // "add the block.\n");
1576 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1558 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1577 set_bit(request->node_idx, mle->maybe_map); 1559 set_bit(request->node_idx, mle->maybe_map);
1578 list_add(&mle->list, &dlm->master_list); 1560 __dlm_insert_mle(dlm, mle);
1579 response = DLM_MASTER_RESP_NO; 1561 response = DLM_MASTER_RESP_NO;
1580 } else { 1562 } else {
1581 // mlog(0, "mle was found\n"); 1563 // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
1967 assert->node_idx, rr, extra_ref, mle->inuse); 1949 assert->node_idx, rr, extra_ref, mle->inuse);
1968 dlm_print_one_mle(mle); 1950 dlm_print_one_mle(mle);
1969 } 1951 }
1970 list_del_init(&mle->list); 1952 __dlm_unlink_mle(dlm, mle);
1971 __dlm_mle_detach_hb_events(dlm, mle); 1953 __dlm_mle_detach_hb_events(dlm, mle);
1972 __dlm_put_mle(mle); 1954 __dlm_put_mle(mle);
1973 if (extra_ref) { 1955 if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3159 tmp->master = master; 3141 tmp->master = master;
3160 atomic_set(&tmp->woken, 1); 3142 atomic_set(&tmp->woken, 1);
3161 wake_up(&tmp->wq); 3143 wake_up(&tmp->wq);
3162 /* remove it from the list so that only one 3144 /* remove it so that only one mle will be found */
3163 * mle will be found */ 3145 __dlm_unlink_mle(dlm, tmp);
3164 list_del_init(&tmp->list);
3165 /* this was obviously WRONG. mle is uninited here. should be tmp. */
3166 __dlm_mle_detach_hb_events(dlm, tmp); 3146 __dlm_mle_detach_hb_events(dlm, tmp);
3167 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3147 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3168 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3148 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3181 mle->master = master; 3161 mle->master = master;
3182 /* do this for consistency with other mle types */ 3162 /* do this for consistency with other mle types */
3183 set_bit(new_master, mle->maybe_map); 3163 set_bit(new_master, mle->maybe_map);
3184 list_add(&mle->list, &dlm->master_list); 3164 __dlm_insert_mle(dlm, mle);
3185 3165
3186 return ret; 3166 return ret;
3187} 3167}
3188 3168
3189 3169/*
3190void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3170 * Sets the owner of the lockres, associated with the mle, to UNKNOWN
3171 */
3172static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3173 struct dlm_master_list_entry *mle)
3191{ 3174{
3192 struct dlm_master_list_entry *mle, *next;
3193 struct dlm_lock_resource *res; 3175 struct dlm_lock_resource *res;
3194 unsigned int hash;
3195 3176
3196 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3177 /* Find the lockres associated to the mle and set its owner to UNK */
3197top: 3178 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3198 assert_spin_locked(&dlm->spinlock); 3179 mle->mnamehash);
3180 if (res) {
3181 spin_unlock(&dlm->master_lock);
3199 3182
3200 /* clean the master list */ 3183 /* move lockres onto recovery list */
3201 spin_lock(&dlm->master_lock); 3184 spin_lock(&res->spinlock);
3202 list_for_each_entry_safe(mle, next, &dlm->master_list, list) { 3185 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3203 BUG_ON(mle->type != DLM_MLE_BLOCK && 3186 dlm_move_lockres_to_recovery_list(dlm, res);
3204 mle->type != DLM_MLE_MASTER && 3187 spin_unlock(&res->spinlock);
3205 mle->type != DLM_MLE_MIGRATION); 3188 dlm_lockres_put(res);
3206
3207 /* MASTER mles are initiated locally. the waiting
3208 * process will notice the node map change
3209 * shortly. let that happen as normal. */
3210 if (mle->type == DLM_MLE_MASTER)
3211 continue;
3212 3189
3190 /* about to get rid of mle, detach from heartbeat */
3191 __dlm_mle_detach_hb_events(dlm, mle);
3213 3192
3214 /* BLOCK mles are initiated by other nodes. 3193 /* dump the mle */
3215 * need to clean up if the dead node would have 3194 spin_lock(&dlm->master_lock);
3216 * been the master. */ 3195 __dlm_put_mle(mle);
3217 if (mle->type == DLM_MLE_BLOCK) { 3196 spin_unlock(&dlm->master_lock);
3218 int bit; 3197 }
3219 3198
3220 spin_lock(&mle->spinlock); 3199 return res;
3221 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3200}
3222 if (bit != dead_node) {
3223 mlog(0, "mle found, but dead node %u would "
3224 "not have been master\n", dead_node);
3225 spin_unlock(&mle->spinlock);
3226 } else {
3227 /* must drop the refcount by one since the
3228 * assert_master will never arrive. this
3229 * may result in the mle being unlinked and
3230 * freed, but there may still be a process
3231 * waiting in the dlmlock path which is fine. */
3232 mlog(0, "node %u was expected master\n",
3233 dead_node);
3234 atomic_set(&mle->woken, 1);
3235 spin_unlock(&mle->spinlock);
3236 wake_up(&mle->wq);
3237 /* do not need events any longer, so detach
3238 * from heartbeat */
3239 __dlm_mle_detach_hb_events(dlm, mle);
3240 __dlm_put_mle(mle);
3241 }
3242 continue;
3243 }
3244 3201
3245 /* everything else is a MIGRATION mle */ 3202static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3246 3203 struct dlm_master_list_entry *mle)
3247 /* the rule for MIGRATION mles is that the master 3204{
3248 * becomes UNKNOWN if *either* the original or 3205 __dlm_mle_detach_hb_events(dlm, mle);
3249 * the new master dies. all UNKNOWN lockreses
3250 * are sent to whichever node becomes the recovery
3251 * master. the new master is responsible for
3252 * determining if there is still a master for
3253 * this lockres, or if he needs to take over
3254 * mastery. either way, this node should expect
3255 * another message to resolve this. */
3256 if (mle->master != dead_node &&
3257 mle->new_master != dead_node)
3258 continue;
3259 3206
3260 /* if we have reached this point, this mle needs to 3207 spin_lock(&mle->spinlock);
3261 * be removed from the list and freed. */ 3208 __dlm_unlink_mle(dlm, mle);
3209 atomic_set(&mle->woken, 1);
3210 spin_unlock(&mle->spinlock);
3262 3211
3263 /* remove from the list early. NOTE: unlinking 3212 wake_up(&mle->wq);
3264 * list_head while in list_for_each_safe */ 3213}
3265 __dlm_mle_detach_hb_events(dlm, mle); 3214
3266 spin_lock(&mle->spinlock); 3215static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3267 list_del_init(&mle->list); 3216 struct dlm_master_list_entry *mle, u8 dead_node)
3217{
3218 int bit;
3219
3220 BUG_ON(mle->type != DLM_MLE_BLOCK);
3221
3222 spin_lock(&mle->spinlock);
3223 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3224 if (bit != dead_node) {
3225 mlog(0, "mle found, but dead node %u would not have been "
3226 "master\n", dead_node);
3227 spin_unlock(&mle->spinlock);
3228 } else {
3229 /* Must drop the refcount by one since the assert_master will
3230 * never arrive. This may result in the mle being unlinked and
3231 * freed, but there may still be a process waiting in the
3232 * dlmlock path which is fine. */
3233 mlog(0, "node %u was expected master\n", dead_node);
3268 atomic_set(&mle->woken, 1); 3234 atomic_set(&mle->woken, 1);
3269 spin_unlock(&mle->spinlock); 3235 spin_unlock(&mle->spinlock);
3270 wake_up(&mle->wq); 3236 wake_up(&mle->wq);
3271 3237
3272 mlog(0, "%s: node %u died during migration from " 3238 /* Do not need events any longer, so detach from heartbeat */
3273 "%u to %u!\n", dlm->name, dead_node, 3239 __dlm_mle_detach_hb_events(dlm, mle);
3274 mle->master, mle->new_master); 3240 __dlm_put_mle(mle);
3275 /* if there is a lockres associated with this 3241 }
3276 * mle, find it and set its owner to UNKNOWN */ 3242}
3277 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
3278 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
3279 mle->u.name.len, hash);
3280 if (res) {
3281 /* unfortunately if we hit this rare case, our
3282 * lock ordering is messed. we need to drop
3283 * the master lock so that we can take the
3284 * lockres lock, meaning that we will have to
3285 * restart from the head of list. */
3286 spin_unlock(&dlm->master_lock);
3287 3243
3288 /* move lockres onto recovery list */ 3244void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3289 spin_lock(&res->spinlock); 3245{
3290 dlm_set_lockres_owner(dlm, res, 3246 struct dlm_master_list_entry *mle;
3291 DLM_LOCK_RES_OWNER_UNKNOWN); 3247 struct dlm_lock_resource *res;
3292 dlm_move_lockres_to_recovery_list(dlm, res); 3248 struct hlist_head *bucket;
3293 spin_unlock(&res->spinlock); 3249 struct hlist_node *list;
3294 dlm_lockres_put(res); 3250 unsigned int i;
3295 3251
3296 /* about to get rid of mle, detach from heartbeat */ 3252 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
3297 __dlm_mle_detach_hb_events(dlm, mle); 3253top:
3254 assert_spin_locked(&dlm->spinlock);
3298 3255
3299 /* dump the mle */ 3256 /* clean the master list */
3300 spin_lock(&dlm->master_lock); 3257 spin_lock(&dlm->master_lock);
3301 __dlm_put_mle(mle); 3258 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3302 spin_unlock(&dlm->master_lock); 3259 bucket = dlm_master_hash(dlm, i);
3260 hlist_for_each(list, bucket) {
3261 mle = hlist_entry(list, struct dlm_master_list_entry,
3262 master_hash_node);
3263
3264 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3265 mle->type != DLM_MLE_MASTER &&
3266 mle->type != DLM_MLE_MIGRATION);
3267
3268 /* MASTER mles are initiated locally. The waiting
3269 * process will notice the node map change shortly.
3270 * Let that happen as normal. */
3271 if (mle->type == DLM_MLE_MASTER)
3272 continue;
3273
3274 /* BLOCK mles are initiated by other nodes. Need to
3275 * clean up if the dead node would have been the
3276 * master. */
3277 if (mle->type == DLM_MLE_BLOCK) {
3278 dlm_clean_block_mle(dlm, mle, dead_node);
3279 continue;
3280 }
3303 3281
3304 /* restart */ 3282 /* Everything else is a MIGRATION mle */
3305 goto top; 3283
3306 } 3284 /* The rule for MIGRATION mles is that the master
3285 * becomes UNKNOWN if *either* the original or the new
3286 * master dies. All UNKNOWN lockres' are sent to
3287 * whichever node becomes the recovery master. The new
3288 * master is responsible for determining if there is
3289 * still a master for this lockres, or if he needs to
3290 * take over mastery. Either way, this node should
3291 * expect another message to resolve this. */
3292
3293 if (mle->master != dead_node &&
3294 mle->new_master != dead_node)
3295 continue;
3296
3297 /* If we have reached this point, this mle needs to be
3298 * removed from the list and freed. */
3299 dlm_clean_migration_mle(dlm, mle);
3300
3301 mlog(0, "%s: node %u died during migration from "
3302 "%u to %u!\n", dlm->name, dead_node, mle->master,
3303 mle->new_master);
3304
3305 /* If we find a lockres associated with the mle, we've
3306 * hit this rare case that messes up our lock ordering.
3307 * If so, we need to drop the master lock so that we can
3308 * take the lockres lock, meaning that we will have to
3309 * restart from the head of list. */
3310 res = dlm_reset_mleres_owner(dlm, mle);
3311 if (res)
3312 /* restart */
3313 goto top;
3307 3314
3308 /* this may be the last reference */ 3315 /* This may be the last reference */
3309 __dlm_put_mle(mle); 3316 __dlm_put_mle(mle);
3317 }
3310 } 3318 }
3311 spin_unlock(&dlm->master_lock); 3319 spin_unlock(&dlm->master_lock);
3312} 3320}
3313 3321
3314
3315int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3322int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3316 u8 old_master) 3323 u8 old_master)
3317{ 3324{
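[Editor's note] The dlmmaster.c changes above replace the single flat dlm->master_list with a hash table of master list entries: insertions go through __dlm_insert_mle(), removals through __dlm_unlink_mle(), and dlm_clean_master_list() now walks every bucket returned by dlm_master_hash(). The sketch below is a minimal userspace illustration of that scan-and-unlink pattern; the struct layout, the hash-by-node choice, and all names here are illustrative stand-ins, not the kernel's (the real code hashes by lock name and uses hlist).

#include <stdlib.h>

#define NBUCKETS 16

struct mle {
    unsigned int node;              /* node this entry tracks */
    struct mle *next;               /* singly linked bucket chain */
};

static struct mle *buckets[NBUCKETS];

static void insert_mle(unsigned int node)
{
    struct mle *m = malloc(sizeof(*m));
    if (!m)
        return;
    m->node = node;
    m->next = buckets[node % NBUCKETS];
    buckets[node % NBUCKETS] = m;   /* hash by node, for the demo only */
}

/* Walk every bucket and unlink entries owned by dead_node -- the same
 * shape as the new dlm_clean_master_list() loop over dlm_master_hash(). */
static void clean_master_hash(unsigned int dead_node)
{
    for (int i = 0; i < NBUCKETS; i++) {
        struct mle **pp = &buckets[i];
        while (*pp) {
            struct mle *m = *pp;
            if (m->node == dead_node) {
                *pp = m->next;      /* unlink in place, no second pass */
                free(m);
            } else {
                pp = &m->next;
            }
        }
    }
}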
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
162 162
163 spin_lock(&res->spinlock); 163 spin_lock(&res->spinlock);
164 if (!__dlm_lockres_unused(res)) { 164 if (!__dlm_lockres_unused(res)) {
165 spin_unlock(&res->spinlock);
166 mlog(0, "%s:%.*s: tried to purge but not unused\n", 165 mlog(0, "%s:%.*s: tried to purge but not unused\n",
167 dlm->name, res->lockname.len, res->lockname.name); 166 dlm->name, res->lockname.len, res->lockname.name);
168 return -ENOTEMPTY; 167 __dlm_print_one_lock_resource(res);
168 spin_unlock(&res->spinlock);
169 BUG();
169 } 170 }
171
172 if (res->state & DLM_LOCK_RES_MIGRATING) {
173 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
174 "being remastered\n", dlm->name, res->lockname.len,
175 res->lockname.name);
176 /* Re-add the lockres to the end of the purge list */
177 if (!list_empty(&res->purge)) {
178 list_del_init(&res->purge);
179 list_add_tail(&res->purge, &dlm->purge_list);
180 }
181 spin_unlock(&res->spinlock);
182 return 0;
183 }
184
170 master = (res->owner == dlm->node_num); 185 master = (res->owner == dlm->node_num);
186
171 if (!master) 187 if (!master)
172 res->state |= DLM_LOCK_RES_DROPPING_REF; 188 res->state |= DLM_LOCK_RES_DROPPING_REF;
173 spin_unlock(&res->spinlock); 189 spin_unlock(&res->spinlock);
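[Editor's note] The dlmthread.c hunk above defers purging a lockres that is mid-migration by moving it to the tail of the purge list: list_del_init() followed by list_add_tail() re-queues the entry behind everything already waiting, so it is revisited once remastering has finished. A minimal standalone sketch of that move-to-tail idiom with a hand-rolled circular doubly linked list follows; the kernel's list_head works the same way, and the names here are illustrative.

struct list_node {
    struct list_node *prev, *next;
};

static void list_init(struct list_node *h) { h->prev = h->next = h; }

static void list_del_init_node(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    list_init(n);                      /* leave the node self-linked */
}

static void list_add_tail_node(struct list_node *n, struct list_node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* Defer an entry: requeue it behind everything already on the list. */
static void requeue_at_tail(struct list_node *entry, struct list_node *purge_list)
{
    list_del_init_node(entry);
    list_add_tail_node(entry, purge_list);
}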
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
244 .flags = 0, 244 .flags = 0,
245}; 245};
246 246
247static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
248 .flags = 0,
249};
250
247static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { 251static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
248 .get_osb = ocfs2_get_dentry_osb, 252 .get_osb = ocfs2_get_dentry_osb,
249 .post_unlock = ocfs2_dentry_post_unlock, 253 .post_unlock = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
622 &ocfs2_rename_lops, osb); 626 &ocfs2_rename_lops, osb);
623} 627}
624 628
629static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
630 struct ocfs2_super *osb)
631{
632 /* nfs_sync lockres doesn't come from a slab so we call init
633 * once on it manually. */
634 ocfs2_lock_res_init_once(res);
635 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
636 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
637 &ocfs2_nfs_sync_lops, osb);
638}
639
625void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 640void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
626 struct ocfs2_file_private *fp) 641 struct ocfs2_file_private *fp)
627{ 642{
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2417 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); 2432 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2418} 2433}
2419 2434
2435int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
2436{
2437 int status;
2438 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2439
2440 if (ocfs2_is_hard_readonly(osb))
2441 return -EROFS;
2442
2443 if (ocfs2_mount_local(osb))
2444 return 0;
2445
2446 status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
2447 0, 0);
2448 if (status < 0)
2449 mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
2450
2451 return status;
2452}
2453
2454void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
2455{
2456 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2457
2458 if (!ocfs2_mount_local(osb))
2459 ocfs2_cluster_unlock(osb, lockres,
2460 ex ? LKM_EXMODE : LKM_PRMODE);
2461}
2462
2420int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2463int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2421{ 2464{
2422 int ret; 2465 int ret;
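[Editor's note] ocfs2_nfs_sync_lock() takes the new cluster lock in EX mode from the NFS export path and in PR mode from the delete path (seen later in this series), so an exclusive holder excludes all deleters cluster-wide while deleters still run concurrently with each other. The pthreads reader/writer analogy below is a loose single-node sketch of that EX/PR relationship, not the ocfs2 locking code itself.

#include <pthread.h>

static pthread_rwlock_t nfs_sync = PTHREAD_RWLOCK_INITIALIZER;

/* EX-like: NFS handle validation excludes every concurrent delete. */
static void lookup_path(void)
{
    pthread_rwlock_wrlock(&nfs_sync);
    /* ... test the inode allocator bit, then read the inode ... */
    pthread_rwlock_unlock(&nfs_sync);
}

/* PR-like: deletes may proceed in parallel with one another. */
static void delete_path(void)
{
    pthread_rwlock_rdlock(&nfs_sync);
    /* ... wipe the inode ... */
    pthread_rwlock_unlock(&nfs_sync);
}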
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2798local: 2841local:
2799 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2800 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2801 2845
2802 osb->cconn = conn; 2846 osb->cconn = conn;
2803 2847
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2833 2877
2834 ocfs2_lock_res_free(&osb->osb_super_lockres); 2878 ocfs2_lock_res_free(&osb->osb_super_lockres);
2835 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2879 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2836 2881
2837 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2838 osb->cconn = NULL; 2883 osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3015{ 3060{
3016 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3017 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3018} 3064}
3019 3065
3020int ocfs2_drop_inode_locks(struct inode *inode) 3066int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 115 int ex);
116int ocfs2_rename_lock(struct ocfs2_super *osb); 116int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 117void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
119void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
118int ocfs2_dentry_lock(struct dentry *dentry, int ex); 120int ocfs2_dentry_lock(struct dentry *dentry, int ex);
119void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 121void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
120int ocfs2_file_lock(struct file *file, int ex, int trylock); 122int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..15713cbb865c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
31 31
32#include "ocfs2.h" 32#include "ocfs2.h"
33 33
34#include "alloc.h"
34#include "dir.h" 35#include "dir.h"
35#include "dlmglue.h" 36#include "dlmglue.h"
36#include "dcache.h" 37#include "dcache.h"
@@ -38,6 +39,7 @@
38#include "inode.h" 39#include "inode.h"
39 40
40#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42#include "suballoc.h"
41 43
42struct ocfs2_inode_handle 44struct ocfs2_inode_handle
43{ 45{
@@ -49,29 +51,98 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
49 struct ocfs2_inode_handle *handle) 51 struct ocfs2_inode_handle *handle)
50{ 52{
51 struct inode *inode; 53 struct inode *inode;
54 struct ocfs2_super *osb = OCFS2_SB(sb);
55 u64 blkno = handle->ih_blkno;
56 int status, set;
52 struct dentry *result; 57 struct dentry *result;
53 58
54 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
55 60
56 if (handle->ih_blkno == 0) { 61 if (blkno == 0) {
57 mlog_errno(-ESTALE); 62 mlog(0, "nfs wants inode with blkno: 0\n");
58 return ERR_PTR(-ESTALE); 63 result = ERR_PTR(-ESTALE);
64 goto bail;
65 }
66
67 inode = ocfs2_ilookup(sb, blkno);
68 /*
 69 * If the inode exists in memory, we only need to check its
70 * generation number
71 */
72 if (inode)
73 goto check_gen;
74
75 /*
76 * This will synchronize us against ocfs2_delete_inode() on
77 * all nodes
78 */
79 status = ocfs2_nfs_sync_lock(osb, 1);
80 if (status < 0) {
81 mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
82 goto check_err;
83 }
84
85 status = ocfs2_test_inode_bit(osb, blkno, &set);
86 if (status < 0) {
87 if (status == -EINVAL) {
88 /*
89 * The blkno NFS gave us doesn't even show up
 90 * as an inode, so we return -ESTALE to be
91 * nice
92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE;
95 } else {
96 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync;
99 }
100
101 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n",
104 (unsigned long long)blkno);
105 status = -ESTALE;
106 goto unlock_nfs_sync;
59 } 107 }
60 108
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); 109 inode = ocfs2_iget(osb, blkno, 0, 0);
62 110
63 if (IS_ERR(inode)) 111unlock_nfs_sync:
64 return (void *)inode; 112 ocfs2_nfs_sync_unlock(osb, 1);
65 113
114check_err:
115 if (status < 0) {
116 if (status == -ESTALE) {
117 mlog(0, "stale inode ino: %llu generation: %u\n",
118 (unsigned long long)blkno, handle->ih_generation);
119 }
120 result = ERR_PTR(status);
121 goto bail;
122 }
123
124 if (IS_ERR(inode)) {
125 mlog_errno(PTR_ERR(inode));
126 result = (void *)inode;
127 goto bail;
128 }
129
130check_gen:
66 if (handle->ih_generation != inode->i_generation) { 131 if (handle->ih_generation != inode->i_generation) {
67 iput(inode); 132 iput(inode);
68 return ERR_PTR(-ESTALE); 133 mlog(0, "stale inode ino: %llu generation: %u\n",
134 (unsigned long long)blkno, handle->ih_generation);
135 result = ERR_PTR(-ESTALE);
136 goto bail;
69 } 137 }
70 138
71 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
72 if (!IS_ERR(result)) 140 if (!IS_ERR(result))
73 result->d_op = &ocfs2_dentry_ops; 141 result->d_op = &ocfs2_dentry_ops;
142 else
143 mlog_errno(PTR_ERR(result));
74 144
145bail:
75 mlog_exit_ptr(result); 146 mlog_exit_ptr(result);
76 return result; 147 return result;
77} 148}
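[Editor's note] The reworked ocfs2_get_dentry() above validates an NFS file handle in two tiers: a lock-free in-memory lookup that only needs the generation check, and a slow path that takes the nfs_sync lock, consults the inode allocator bitmap, and only then reads the inode from disk. A compilable schematic of that decision flow, with stub helpers standing in for ocfs2_ilookup()/ocfs2_test_inode_bit()/ocfs2_iget() (the stubs and their return values are assumptions for the sketch):

#include <stddef.h>

struct inode { unsigned int i_generation; };

/* Stubs standing in for the real cache lookup, bitmap test and iget. */
static struct inode *cache_lookup(unsigned long long blkno) { (void)blkno; return NULL; }
static int alloc_bit_set(unsigned long long blkno) { (void)blkno; return 1; }
static struct inode *read_inode(unsigned long long blkno)
{ static struct inode i; (void)blkno; return &i; }

/* Returns the inode, or NULL for a stale handle. */
static struct inode *handle_to_inode(unsigned long long blkno, unsigned int gen)
{
    struct inode *inode = cache_lookup(blkno);

    if (!inode) {
        /* Slow path: serialize against deletes, then check the
         * allocator bit before touching the disk inode at all. */
        if (!alloc_bit_set(blkno))
            return NULL;            /* bit clear => handle is stale */
        inode = read_inode(blkno);
        if (!inode)
            return NULL;
    }

    /* Fast and slow paths both end with the generation check. */
    if (inode->i_generation != gen)
        return NULL;                /* inode number was recycled */
    return inode;
}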
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..c2a87c885b73 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1912,6 +1912,22 @@ out_sems:
1912 return written ? written : ret; 1912 return written ? written : ret;
1913} 1913}
1914 1914
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
1916 struct file *out,
1917 struct splice_desc *sd)
1918{
1919 int ret;
1920
1921 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
1922 sd->total_len, 0, NULL);
1923 if (ret < 0) {
1924 mlog_errno(ret);
1925 return ret;
1926 }
1927
1928 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
1929}
1930
1915static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1931static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1916 struct file *out, 1932 struct file *out,
1917 loff_t *ppos, 1933 loff_t *ppos,
@@ -1919,34 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1919 unsigned int flags) 1935 unsigned int flags)
1920{ 1936{
1921 int ret; 1937 int ret;
1922 struct inode *inode = out->f_path.dentry->d_inode; 1938 struct address_space *mapping = out->f_mapping;
1939 struct inode *inode = mapping->host;
1940 struct splice_desc sd = {
1941 .total_len = len,
1942 .flags = flags,
1943 .pos = *ppos,
1944 .u.file = out,
1945 };
1923 1946
1924 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1947 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1925 (unsigned int)len, 1948 (unsigned int)len,
1926 out->f_path.dentry->d_name.len, 1949 out->f_path.dentry->d_name.len,
1927 out->f_path.dentry->d_name.name); 1950 out->f_path.dentry->d_name.name);
1928 1951
1929 inode_double_lock(inode, pipe->inode); 1952 if (pipe->inode)
1953 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
1930 1954
1931 ret = ocfs2_rw_lock(inode, 1); 1955 splice_from_pipe_begin(&sd);
1932 if (ret < 0) { 1956 do {
1933 mlog_errno(ret); 1957 ret = splice_from_pipe_next(pipe, &sd);
1934 goto out; 1958 if (ret <= 0)
1935 } 1959 break;
1936 1960
1937 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1961 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1938 NULL); 1962 ret = ocfs2_rw_lock(inode, 1);
1939 if (ret < 0) { 1963 if (ret < 0)
1940 mlog_errno(ret); 1964 mlog_errno(ret);
1941 goto out_unlock; 1965 else {
1942 } 1966 ret = ocfs2_splice_to_file(pipe, out, &sd);
1967 ocfs2_rw_unlock(inode, 1);
1968 }
1969 mutex_unlock(&inode->i_mutex);
1970 } while (ret > 0);
1971 splice_from_pipe_end(pipe, &sd);
1943 1972
1944 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1973 if (pipe->inode)
1974 mutex_unlock(&pipe->inode->i_mutex);
1945 1975
1946out_unlock: 1976 if (sd.num_spliced)
1947 ocfs2_rw_unlock(inode, 1); 1977 ret = sd.num_spliced;
1948out: 1978
1949 inode_double_unlock(inode, pipe->inode); 1979 if (ret > 0) {
1980 unsigned long nr_pages;
1981
1982 *ppos += ret;
1983 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1984
1985 /*
1986 * If file or inode is SYNC and we actually wrote some data,
1987 * sync it.
1988 */
1989 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1990 int err;
1991
1992 mutex_lock(&inode->i_mutex);
1993 err = ocfs2_rw_lock(inode, 1);
1994 if (err < 0) {
1995 mlog_errno(err);
1996 } else {
1997 err = generic_osync_inode(inode, mapping,
1998 OSYNC_METADATA|OSYNC_DATA);
1999 ocfs2_rw_unlock(inode, 1);
2000 }
2001 mutex_unlock(&inode->i_mutex);
2002
2003 if (err)
2004 ret = err;
2005 }
2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2007 }
1950 2008
1951 mlog_exit(ret); 2009 mlog_exit(ret);
1952 return ret; 2010 return ret;
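[Editor's note] After the splice loop, the new code advances *ppos and converts the byte count to a page count with a standard round-up shift, nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, before throttling via balance_dirty_pages_ratelimited_nr(). A quick worked check of that arithmetic, assuming the usual 4096-byte page:

#include <assert.h>

#define PAGE_CACHE_SHIFT 12                     /* 4096-byte pages assumed */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

static unsigned long bytes_to_pages(unsigned long bytes)
{
    return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
    assert(bytes_to_pages(0)    == 0);
    assert(bytes_to_pages(1)    == 1);  /* a partial page still dirties one */
    assert(bytes_to_pages(4096) == 1);
    assert(bytes_to_pages(5000) == 2);  /* 4096 + 904 spans two pages */
    return 0;
}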
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
112 oi->ip_attr |= OCFS2_DIRSYNC_FL; 113 oi->ip_attr |= OCFS2_DIRSYNC_FL;
113} 114}
114 115
116struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
117{
118 struct ocfs2_find_inode_args args;
119
120 args.fi_blkno = blkno;
121 args.fi_flags = 0;
122 args.fi_ino = ino_from_blkno(sb, blkno);
123 args.fi_sysfile_type = 0;
124
125 return ilookup5(sb, blkno, ocfs2_find_actor, &args);
126}
115struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
116 int sysfile_type) 128 int sysfile_type)
117{ 129{
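[Editor's note] ocfs2_ilookup() wraps ilookup5(), which searches the inode cache by a hash value plus a caller-supplied match callback, so a block number can be checked against in-memory inodes without any disk I/O. The sketch below shows that "hash plus test callback" lookup shape over a plain array; ilookup5() itself is the real VFS API, while everything in this snippet is illustrative.

#include <stddef.h>

struct entry {
    unsigned long key;
    void *payload;
};

/* Mirrors ilookup5()'s shape: the hash narrows the candidates and a
 * caller-supplied test() confirms the exact match against opaque data. */
static struct entry *lookup5(struct entry *tbl, size_t n, unsigned long hash,
                             int (*test)(struct entry *, void *), void *data)
{
    for (size_t i = 0; i < n; i++)
        if (tbl[i].key == hash && test(&tbl[i], data))
            return &tbl[i];
    return NULL;   /* not cached: the caller decides whether to do I/O */
}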
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
275 (unsigned long long)OCFS2_I(inode)->ip_blkno, 287 (unsigned long long)OCFS2_I(inode)->ip_blkno,
276 (unsigned long long)le64_to_cpu(fe->i_blkno)); 288 (unsigned long long)le64_to_cpu(fe->i_blkno));
277 289
278 inode->i_nlink = le16_to_cpu(fe->i_links_count); 290 inode->i_nlink = ocfs2_read_links_count(fe);
279 291
280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 292 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
351 363
352 ocfs2_set_inode_flags(inode); 364 ocfs2_set_inode_flags(inode);
353 365
366 OCFS2_I(inode)->ip_last_used_slot = 0;
367 OCFS2_I(inode)->ip_last_used_group = 0;
354 mlog_exit_void(); 368 mlog_exit_void();
355} 369}
356 370
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 620 }
607 621
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 622 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 623 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 624 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 625 status = PTR_ERR(handle);
612 mlog_errno(status); 626 mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 754 goto bail_unlock_dir;
741 } 755 }
742 756
757 /* Remove any dir index tree */
758 if (S_ISDIR(inode->i_mode)) {
759 status = ocfs2_dx_dir_truncate(inode, di_bh);
760 if (status) {
761 mlog_errno(status);
762 goto bail_unlock_dir;
763 }
764 }
765
743 /*Free extended attribute resources associated with this inode.*/ 766 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 767 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 768 if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
949 goto bail; 972 goto bail;
950 } 973 }
951 974
975 /*
976 * Synchronize us against ocfs2_get_dentry. We take this in
977 * shared mode so that all nodes can still concurrently
978 * process deletes.
979 */
980 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
981 if (status < 0) {
982 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
983 ocfs2_cleanup_delete_inode(inode, 0);
984 goto bail_unblock;
985 }
952 /* Lock down the inode. This gives us an up to date view of 986 /* Lock down the inode. This gives us an up to date view of
953 * it's metadata (for verification), and allows us to 987 * it's metadata (for verification), and allows us to
954 * serialize delete_inode on multiple nodes. 988 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
962 if (status != -ENOENT) 996 if (status != -ENOENT)
963 mlog_errno(status); 997 mlog_errno(status);
964 ocfs2_cleanup_delete_inode(inode, 0); 998 ocfs2_cleanup_delete_inode(inode, 0);
965 goto bail_unblock; 999 goto bail_unlock_nfs_sync;
966 } 1000 }
967 1001
968 /* Query the cluster. This will be the final decision made 1002 /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
1005bail_unlock_inode: 1039bail_unlock_inode:
1006 ocfs2_inode_unlock(inode, 1); 1040 ocfs2_inode_unlock(inode, 1);
1007 brelse(di_bh); 1041 brelse(di_bh);
1042
1043bail_unlock_nfs_sync:
1044 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1045
1008bail_unblock: 1046bail_unblock:
1009 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1047 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
1010 if (status < 0) 1048 if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1205 spin_unlock(&OCFS2_I(inode)->ip_lock); 1243 spin_unlock(&OCFS2_I(inode)->ip_lock);
1206 1244
1207 fe->i_size = cpu_to_le64(i_size_read(inode)); 1245 fe->i_size = cpu_to_le64(i_size_read(inode));
1208 fe->i_links_count = cpu_to_le16(inode->i_nlink); 1246 ocfs2_set_links_count(fe, inode->i_nlink);
1209 fe->i_uid = cpu_to_le32(inode->i_uid); 1247 fe->i_uid = cpu_to_le32(inode->i_uid);
1210 fe->i_gid = cpu_to_le32(inode->i_gid); 1248 fe->i_gid = cpu_to_le32(inode->i_gid);
1211 fe->i_mode = cpu_to_le16(inode->i_mode); 1249 fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1242 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1280 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1243 ocfs2_set_inode_flags(inode); 1281 ocfs2_set_inode_flags(inode);
1244 i_size_write(inode, le64_to_cpu(fe->i_size)); 1282 i_size_write(inode, le64_to_cpu(fe->i_size));
1245 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1283 inode->i_nlink = ocfs2_read_links_count(fe);
1246 inode->i_uid = le32_to_cpu(fe->i_uid); 1284 inode->i_uid = le32_to_cpu(fe->i_uid);
1247 inode->i_gid = le32_to_cpu(fe->i_gid); 1285 inode->i_gid = le32_to_cpu(fe->i_gid);
1248 inode->i_mode = le16_to_cpu(fe->i_mode); 1286 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
72 72
73 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode; 74 struct jbd2_inode ip_jinode;
75
76 /* Only valid if the inode is the dir. */
77 u32 ip_last_used_slot;
78 u64 ip_last_used_group;
75}; 79};
76 80
77/* 81/*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
124/* Flags for ocfs2_iget() */ 128/* Flags for ocfs2_iget() */
125#define OCFS2_FI_FLAG_SYSFILE 0x1 129#define OCFS2_FI_FLAG_SYSFILE 0x1
126#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 130#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
131struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 132struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 133 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 134int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
65static int ocfs2_recover_orphans(struct ocfs2_super *osb, 65static int ocfs2_recover_orphans(struct ocfs2_super *osb,
66 int slot); 66 int slot);
67static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
68static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
69 int slot_num,
70 struct ocfs2_dinode *la_dinode,
71 struct ocfs2_dinode *tl_dinode,
72 struct ocfs2_quota_recovery *qrec);
68 73
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 74static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{ 75{
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
76 return __ocfs2_wait_on_mount(osb, 1); 81 return __ocfs2_wait_on_mount(osb, 1);
77} 82}
78 83
79
80
81/* 84/*
 82 * The recovery_list is a simple linked list of node numbers to recover. 85 * This replay_map is to track online/offline slots, so we can recover
83 * It is protected by the recovery_lock. 86 * offline slots during recovery and mount
84 */ 87 */
85 88
86struct ocfs2_recovery_map { 89enum ocfs2_replay_state {
87 unsigned int rm_used; 90 REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */
88 unsigned int *rm_entries; 91 REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */
92 REPLAY_DONE /* Replay was already queued */
89}; 93};
90 94
95struct ocfs2_replay_map {
96 unsigned int rm_slots;
97 enum ocfs2_replay_state rm_state;
98 unsigned char rm_replay_slots[0];
99};
100
101void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
102{
103 if (!osb->replay_map)
104 return;
105
106 /* If we've already queued the replay, we don't have any more to do */
107 if (osb->replay_map->rm_state == REPLAY_DONE)
108 return;
109
110 osb->replay_map->rm_state = state;
111}
112
113int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
114{
115 struct ocfs2_replay_map *replay_map;
116 int i, node_num;
117
118 /* If replay map is already set, we don't do it again */
119 if (osb->replay_map)
120 return 0;
121
122 replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
123 (osb->max_slots * sizeof(char)), GFP_KERNEL);
124
125 if (!replay_map) {
126 mlog_errno(-ENOMEM);
127 return -ENOMEM;
128 }
129
130 spin_lock(&osb->osb_lock);
131
132 replay_map->rm_slots = osb->max_slots;
133 replay_map->rm_state = REPLAY_UNNEEDED;
134
135 /* set rm_replay_slots for offline slot(s) */
136 for (i = 0; i < replay_map->rm_slots; i++) {
137 if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
138 replay_map->rm_replay_slots[i] = 1;
139 }
140
141 osb->replay_map = replay_map;
142 spin_unlock(&osb->osb_lock);
143 return 0;
144}
145
146void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
147{
148 struct ocfs2_replay_map *replay_map = osb->replay_map;
149 int i;
150
151 if (!replay_map)
152 return;
153
154 if (replay_map->rm_state != REPLAY_NEEDED)
155 return;
156
157 for (i = 0; i < replay_map->rm_slots; i++)
158 if (replay_map->rm_replay_slots[i])
159 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
160 NULL, NULL);
161 replay_map->rm_state = REPLAY_DONE;
162}
163
164void ocfs2_free_replay_slots(struct ocfs2_super *osb)
165{
166 struct ocfs2_replay_map *replay_map = osb->replay_map;
167
168 if (!osb->replay_map)
169 return;
170
171 kfree(replay_map);
172 osb->replay_map = NULL;
173}
174
91int ocfs2_recovery_init(struct ocfs2_super *osb) 175int ocfs2_recovery_init(struct ocfs2_super *osb)
92{ 176{
93 struct ocfs2_recovery_map *rm; 177 struct ocfs2_recovery_map *rm;
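[Editor's note] struct ocfs2_replay_map above ends in rm_replay_slots[0], an old-style flexible array member, and is allocated in one shot as kzalloc(sizeof(struct ocfs2_replay_map) + max_slots * sizeof(char)). The same allocation pattern in standalone C follows; C99 spells the member rm_replay_slots[] rather than [0].

#include <stdlib.h>

struct replay_map {
    unsigned int rm_slots;
    unsigned char rm_replay_slots[];   /* flexible array member */
};

static struct replay_map *replay_map_alloc(unsigned int max_slots)
{
    struct replay_map *map;

    /* Header and per-slot flags live in one zeroed allocation. */
    map = calloc(1, sizeof(*map) + max_slots * sizeof(unsigned char));
    if (!map)
        return NULL;
    map->rm_slots = max_slots;
    return map;
}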
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
496 }, 580 },
497}; 581};
498 582
583static struct ocfs2_triggers dr_triggers = {
584 .ot_triggers = {
585 .t_commit = ocfs2_commit_trigger,
586 .t_abort = ocfs2_abort_trigger,
587 },
588 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
589};
590
591static struct ocfs2_triggers dl_triggers = {
592 .ot_triggers = {
593 .t_commit = ocfs2_commit_trigger,
594 .t_abort = ocfs2_abort_trigger,
595 },
596 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
597};
598
499static int __ocfs2_journal_access(handle_t *handle, 599static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode, 600 struct inode *inode,
501 struct buffer_head *bh, 601 struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
600 type); 700 type);
601} 701}
602 702
703int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
704 struct buffer_head *bh, int type)
705{
706 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
707 type);
708}
709
710int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
711 struct buffer_head *bh, int type)
712{
713 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
714 type);
715}
716
603int ocfs2_journal_access(handle_t *handle, struct inode *inode, 717int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type) 718 struct buffer_head *bh, int type)
605{ 719{
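[Editor's note] The new dr_triggers/dl_triggers follow the existing trigger pattern: each one records, via offsetof(), where the checksum field sits inside its on-disk structure, so one generic commit callback can recompute the check for any block type. offsetof() behaves as shown below; the struct here is a stand-in, not the real ocfs2_dx_root_block layout.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct block_check { uint32_t crc32; uint16_t ecc; uint16_t pad; };

/* Stand-in for an on-disk block with an embedded check field. */
struct dx_root_block {
    uint64_t signature;
    struct block_check dr_check;   /* checksum lives at a fixed offset */
    uint64_t payload[4];
};

int main(void)
{
    /* A generic trigger only needs the byte offset of the check field. */
    size_t off = offsetof(struct dx_root_block, dr_check);
    printf("dr_check at byte offset %zu\n", off);
    return 0;
}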
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1176} 1290}
1177 1291
1178/* Called by the mount code to queue recovery the last part of 1292/* Called by the mount code to queue recovery the last part of
1179 * recovery for it's own slot. */ 1293 * recovery for its own and offline slot(s). */
1180void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 1294void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1181{ 1295{
1182 struct ocfs2_journal *journal = osb->journal; 1296 struct ocfs2_journal *journal = osb->journal;
1183 1297
1184 if (osb->dirty) { 1298 /* No need to queue up our truncate_log as regular cleanup will catch
1185 /* No need to queue up our truncate_log as regular 1299 * that */
1186 * cleanup will catch that. */ 1300 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1187 ocfs2_queue_recovery_completion(journal, 1301 osb->local_alloc_copy, NULL, NULL);
1188 osb->slot_num, 1302 ocfs2_schedule_truncate_log_flush(osb, 0);
1189 osb->local_alloc_copy,
1190 NULL,
1191 NULL);
1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1193 1303
1194 osb->local_alloc_copy = NULL; 1304 osb->local_alloc_copy = NULL;
1195 osb->dirty = 0; 1305 osb->dirty = 0;
1196 } 1306
1307 /* queue to recover orphan slots for all offline slots */
1308 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1309 ocfs2_queue_replay_slots(osb);
1310 ocfs2_free_replay_slots(osb);
1197} 1311}
1198 1312
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) 1313void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
1236 goto bail; 1350 goto bail;
1237 } 1351 }
1238 1352
1353 status = ocfs2_compute_replay_slots(osb);
1354 if (status < 0)
1355 mlog_errno(status);
1356
1357 /* queue recovery for our own slot */
1358 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1359 NULL, NULL);
1360
1239 spin_lock(&osb->osb_lock); 1361 spin_lock(&osb->osb_lock);
1240 while (rm->rm_used) { 1362 while (rm->rm_used) {
1241 /* It's always safe to remove entry zero, as we won't 1363 /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
1301 1423
1302 ocfs2_super_unlock(osb, 1); 1424 ocfs2_super_unlock(osb, 1);
1303 1425
1304 /* We always run recovery on our own orphan dir - the dead 1426 /* queue recovery for offline slots */
1305 * node(s) may have disallowd a previos inode delete. Re-processing 1427 ocfs2_queue_replay_slots(osb);
1306 * is therefore required. */
1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1308 NULL, NULL);
1309 1428
1310bail: 1429bail:
1311 mutex_lock(&osb->recovery_lock); 1430 mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
1314 goto restart; 1433 goto restart;
1315 } 1434 }
1316 1435
1436 ocfs2_free_replay_slots(osb);
1317 osb->recovery_thread_task = NULL; 1437 osb->recovery_thread_task = NULL;
1318 mb(); /* sync with ocfs2_recovery_thread_running */ 1438 mb(); /* sync with ocfs2_recovery_thread_running */
1319 wake_up(&osb->recovery_event); 1439 wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1465 goto done; 1585 goto done;
1466 } 1586 }
1467 1587
1588 /* we need to run complete recovery for offline orphan slots */
1589 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1590
1468 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1591 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1469 node_num, slot_num, 1592 node_num, slot_num,
1470 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1593 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..eb7b76331eb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
38struct ocfs2_super; 38struct ocfs2_super;
39struct ocfs2_dinode; 39struct ocfs2_dinode;
40 40
41/*
42 * The recovery_list is a simple linked list of node numbers to recover.
43 * It is protected by the recovery_lock.
44 */
45
46struct ocfs2_recovery_map {
47 unsigned int rm_used;
48 unsigned int *rm_entries;
49};
50
51
41struct ocfs2_journal { 52struct ocfs2_journal {
42 enum ocfs2_journal_state j_state; /* Journals current state */ 53 enum ocfs2_journal_state j_state; /* Journals current state */
43 54
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
139int ocfs2_recovery_init(struct ocfs2_super *osb); 150int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb); 151void ocfs2_recovery_exit(struct ocfs2_super *osb);
141 152
153int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
142/* 154/*
143 * Journal Control: 155 * Journal Control:
144 * Initialize, Load, Shutdown, Wipe a journal. 156 * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
266/* dirblock */ 278/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
281/* ocfs2_dx_root_block */
282int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
283 struct buffer_head *bh, int type);
284/* ocfs2_dx_leaf */
285int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
286 struct buffer_head *bh, int type);
269/* Anything that has no ecc */ 287/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
368} 386}
369 387
370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 388/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
371 * bitmap block for the new bit) */ 389 * bitmap block for the new bit) dx_root update for free list */
372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 390#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
391
392static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
393{
 394 /* 1 block for index, 2 allocs (data, metadata), 1 cluster's
395 * worth of blocks for initial extent. */
396 return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
397 ocfs2_clusters_to_blocks(sb, 1);
398}
373 399
374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 400/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
375 * group descriptor + mkdir/symlink blocks + quota update */ 401 * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
376static inline int ocfs2_mknod_credits(struct super_block *sb) 402 * blocks + quota update */
403static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
404 int xattr_credits)
377{ 405{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + 406 int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
407
408 if (is_dir)
409 dir_credits += ocfs2_add_dir_index_credits(sb);
410
411 return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
379 ocfs2_quota_trans_credits(sb); 412 ocfs2_quota_trans_credits(sb);
380} 413}
381 414
@@ -388,31 +421,32 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 421#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
389 422
390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 423/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
391 * update on dir */ 424 * update on dir + index leaf + dx root update for free list */
392static inline int ocfs2_link_credits(struct super_block *sb) 425static inline int ocfs2_link_credits(struct super_block *sb)
393{ 426{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + 427 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
395 ocfs2_quota_trans_credits(sb); 428 ocfs2_quota_trans_credits(sb);
396} 429}
397 430
398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 431/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
399 * dir inode link */ 432 * dir inode link + dir inode index leaf + dir index root */
400static inline int ocfs2_unlink_credits(struct super_block *sb) 433static inline int ocfs2_unlink_credits(struct super_block *sb)
401{ 434{
402 /* The quota update from ocfs2_link_credits is unused here... */ 435 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); 436 return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
404} 437}
405 438
406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
407 * inode alloc group descriptor */ 440 * inode alloc group descriptor + orphan dir index root +
408#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) 441 * orphan dir index leaf */
442#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
409 443
410/* dinode update, old dir dinode update, new dir dinode update, old 444/* dinode update, old dir dinode update, new dir dinode update, old
411 * dir dir entry, new dir dir entry, dir entry update for renaming 445 * dir dir entry, new dir dir entry, dir entry update for renaming
412 * directory + target unlink */ 446 * directory + target unlink + 3 x dir index leaves */
413static inline int ocfs2_rename_credits(struct super_block *sb) 447static inline int ocfs2_rename_credits(struct super_block *sb)
414{ 448{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); 449 return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
416} 450}
417 451
418/* global bitmap dinode, group desc., relinked group, 452/* global bitmap dinode, group desc., relinked group,
@@ -422,6 +456,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
422 + OCFS2_INODE_UPDATE_CREDITS \ 456 + OCFS2_INODE_UPDATE_CREDITS \
423 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) 457 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
424 458
459/* inode update, removal of dx root block from allocator */
460#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
461 OCFS2_SUBALLOC_FREE)
462
463static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
464{
465 int credits = 1 + OCFS2_SUBALLOC_ALLOC;
466
467 credits += ocfs2_clusters_to_blocks(sb, 1);
468 credits += ocfs2_quota_trans_credits(sb);
469
470 return credits;
471}
472
425/* 473/*
426 * Please note that the caller must make sure that root_el is the root 474 * Please note that the caller must make sure that root_el is the root
427 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 475 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +505,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
457 505
458static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 506static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
459{ 507{
460 int blocks = ocfs2_mknod_credits(sb); 508 int blocks = ocfs2_mknod_credits(sb, 0, 0);
461 509
462 /* links can be longer than one block so we may update many 510 /* links can be longer than one block so we may update many
463 * within our single allocated extent. */ 511 * within our single allocated extent. */
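[Editor's note] The credit formulas above are plain block counts: ocfs2_mknod_credits() grows from a flat 3 + ... to 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + quota credits, with directories additionally paying ocfs2_add_dir_index_credits() for the index block, its two suballocator allocations, and a cluster's worth of extent blocks. A small standalone mirror of that arithmetic, with every constant stubbed to an illustrative value (the real ones come from the ocfs2 headers and the super block geometry):

#include <stdio.h>

/* Illustrative stand-ins only; do not read these as the real values. */
#define SUBALLOC_ALLOC        3
#define DIR_LINK_ADDITIONAL   (1 + 2 + 1)  /* data + bitmap fe/bit + dx root */
#define QUOTA_CREDITS         0            /* quotas off in this example */
#define CLUSTER_BLOCKS        8            /* blocks per cluster, assumed */

static int add_dir_index_credits(void)
{
    /* 1 index block + 2 suballocator allocations + 1 cluster of blocks */
    return 1 + 2 * SUBALLOC_ALLOC + CLUSTER_BLOCKS;
}

static int mknod_credits(int is_dir, int xattr_credits)
{
    int dir_credits = DIR_LINK_ADDITIONAL;

    if (is_dir)
        dir_credits += add_dir_index_credits();

    return 4 + SUBALLOC_ALLOC + dir_credits + xattr_credits + QUOTA_CREDITS;
}

int main(void)
{
    printf("file: %d credits, dir: %d credits\n",
           mknod_credits(0, 0), mknod_credits(1, 0));
    return 0;
}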
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
32 31
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78#ifdef CONFIG_OCFS2_FS_STATS
79
80static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
81{
82 file->private_data = inode->i_private;
83 return 0;
84}
85
86#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
87#define LA_DEBUG_VER 1
88static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
89 size_t count, loff_t *ppos)
90{
91 static DEFINE_MUTEX(la_debug_mutex);
92 struct ocfs2_super *osb = file->private_data;
93 int written, ret;
94 char *buf = osb->local_alloc_debug_buf;
95
96 mutex_lock(&la_debug_mutex);
97 memset(buf, 0, LA_DEBUG_BUF_SZ);
98
99 written = snprintf(buf, LA_DEBUG_BUF_SZ,
100 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
101 LA_DEBUG_VER,
102 (unsigned long long)osb->la_last_gd,
103 osb->local_alloc_default_bits,
104 osb->local_alloc_bits, osb->local_alloc_state);
105
106 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
107
108 mutex_unlock(&la_debug_mutex);
109 return ret;
110}
111
112static const struct file_operations ocfs2_la_debug_fops = {
113 .open = ocfs2_la_debug_open,
114 .read = ocfs2_la_debug_read,
115};
116
117static void ocfs2_init_la_debug(struct ocfs2_super *osb)
118{
119 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
120 if (!osb->local_alloc_debug_buf)
121 return;
122
123 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
124 S_IFREG|S_IRUSR,
125 osb->osb_debug_root,
126 osb,
127 &ocfs2_la_debug_fops);
128 if (!osb->local_alloc_debug) {
129 kfree(osb->local_alloc_debug_buf);
130 osb->local_alloc_debug_buf = NULL;
131 }
132}
133
134static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
135{
136 if (osb->local_alloc_debug)
137 debugfs_remove(osb->local_alloc_debug);
138
139 if (osb->local_alloc_debug_buf)
140 kfree(osb->local_alloc_debug_buf);
141
142 osb->local_alloc_debug_buf = NULL;
143 osb->local_alloc_debug = NULL;
144}
145#else /* CONFIG_OCFS2_FS_STATS */
146static void ocfs2_init_la_debug(struct ocfs2_super *osb)
147{
148 return;
149}
150static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
151{
152 return;
153}
154#endif
155
156static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
157{ 78{
158 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
226 147
227 mlog_entry_void(); 148 mlog_entry_void();
228 149
229 ocfs2_init_la_debug(osb);
230
231 if (osb->local_alloc_bits == 0) 150 if (osb->local_alloc_bits == 0)
232 goto bail; 151 goto bail;
233 152
@@ -299,9 +218,6 @@ bail:
299 if (inode) 218 if (inode)
300 iput(inode); 219 iput(inode);
301 220
302 if (status < 0)
303 ocfs2_shutdown_la_debug(osb);
304
305 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 221 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
306 222
307 mlog_exit(status); 223 mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
331 cancel_delayed_work(&osb->la_enable_wq); 247 cancel_delayed_work(&osb->la_enable_wq);
332 flush_workqueue(ocfs2_wq); 248 flush_workqueue(ocfs2_wq);
333 249
334 ocfs2_shutdown_la_debug(osb);
335
336 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 250 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
337 goto out; 251 goto out;
338 252
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
154 return ret; 154 return ret;
155} 155}
156 156
157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158{ 158{
159 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 161 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 162 sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
197 if (ret2 < 0) 198 if (ret2 < 0)
198 mlog_errno(ret2); 199 mlog_errno(ret2);
199 200 if (ret)
201 ret = VM_FAULT_SIGBUS;
200 return ret; 202 return ret;
201} 203}
202 204
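[Editor's note] The mmap.c hunk tracks the VFS change of ->page_mkwrite() from taking a bare struct page to taking a struct vm_fault, and it now translates any internal error into VM_FAULT_SIGBUS instead of leaking a raw negative errno to the fault handler. The translation step, roughly, with the fault code mocked so the snippet builds standalone:

/* Mocked fault code; in the kernel this comes from <linux/mm.h>. */
#define VM_FAULT_SIGBUS 0x0002

/* Convert an internal 0/-errno result into a fault-handler return:
 * 0 means "page is now writable", anything else becomes SIGBUS. */
static int mkwrite_result(int err)
{
    return err ? VM_FAULT_SIGBUS : 0;
}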
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..33464c6b60a2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 80 struct inode **ret_orphan_dir,
81 struct inode *inode, 81 struct inode *inode,
82 char *name, 82 char *name,
83 struct buffer_head **de_bh); 83 struct ocfs2_dir_lookup_result *lookup);
84 84
85static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
86 handle_t *handle, 86 handle_t *handle,
87 struct inode *inode, 87 struct inode *inode,
88 struct ocfs2_dinode *fe, 88 struct ocfs2_dinode *fe,
89 char *name, 89 char *name,
90 struct buffer_head *de_bh, 90 struct ocfs2_dir_lookup_result *lookup,
91 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode);
92 92
93static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 93static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
228 struct ocfs2_super *osb; 228 struct ocfs2_super *osb;
229 struct ocfs2_dinode *dirfe; 229 struct ocfs2_dinode *dirfe;
230 struct buffer_head *new_fe_bh = NULL; 230 struct buffer_head *new_fe_bh = NULL;
231 struct buffer_head *de_bh = NULL;
232 struct inode *inode = NULL; 231 struct inode *inode = NULL;
233 struct ocfs2_alloc_context *inode_ac = NULL; 232 struct ocfs2_alloc_context *inode_ac = NULL;
234 struct ocfs2_alloc_context *data_ac = NULL; 233 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL; 234 struct ocfs2_alloc_context *meta_ac = NULL;
236 int want_clusters = 0; 235 int want_clusters = 0;
236 int want_meta = 0;
237 int xattr_credits = 0; 237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = { 238 struct ocfs2_security_xattr_info si = {
239 .enable = 1, 239 .enable = 1,
240 }; 240 };
241 int did_quota_inode = 0; 241 int did_quota_inode = 0;
242 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 243
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 244 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 245 (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
254 return status; 255 return status;
255 } 256 }
256 257
257 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { 258 if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
258 status = -EMLINK; 259 status = -EMLINK;
259 goto leave; 260 goto leave;
260 } 261 }
261 262
262 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 263 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
263 if (!dirfe->i_links_count) { 264 if (!ocfs2_read_links_count(dirfe)) {
264 /* can't make a file in a deleted directory. */ 265 /* can't make a file in a deleted directory. */
265 status = -ENOENT; 266 status = -ENOENT;
266 goto leave; 267 goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
274 /* get a spot inside the dir. */ 275 /* get a spot inside the dir. */
275 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 276 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
276 dentry->d_name.name, 277 dentry->d_name.name,
277 dentry->d_name.len, &de_bh); 278 dentry->d_name.len, &lookup);
278 if (status < 0) { 279 if (status < 0) {
279 mlog_errno(status); 280 mlog_errno(status);
280 goto leave; 281 goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
308 309
309 /* calculate meta data/clusters for setting security and acl xattr */ 310 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, 311 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters, 312 &si, &want_clusters,
312 &xattr_credits, &xattr_ac); 313 &xattr_credits, &want_meta);
313 if (status < 0) { 314 if (status < 0) {
314 mlog_errno(status); 315 mlog_errno(status);
315 goto leave; 316 goto leave;
316 } 317 }
317 318
318 /* Reserve a cluster if creating an extent based directory. */ 319 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) 320 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
320 want_clusters += 1; 321 want_clusters += 1;
321 322
323 /* Dir indexing requires extra space as well */
324 if (ocfs2_supports_indexed_dirs(osb))
325 want_meta++;
326 }
327
328 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
329 if (status < 0) {
330 if (status != -ENOSPC)
331 mlog_errno(status);
332 goto leave;
333 }
334
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); 335 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) { 336 if (status < 0) {
324 if (status != -ENOSPC) 337 if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
326 goto leave; 339 goto leave;
327 } 340 }
328 341
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + 342 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
330 xattr_credits); 343 S_ISDIR(mode),
344 xattr_credits));
331 if (IS_ERR(handle)) { 345 if (IS_ERR(handle)) {
332 status = PTR_ERR(handle); 346 status = PTR_ERR(handle);
333 handle = NULL; 347 handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
355 369
356 if (S_ISDIR(mode)) { 370 if (S_ISDIR(mode)) {
357 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 371 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
358 new_fe_bh, data_ac); 372 new_fe_bh, data_ac, meta_ac);
359 if (status < 0) { 373 if (status < 0) {
360 mlog_errno(status); 374 mlog_errno(status);
361 goto leave; 375 goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
367 mlog_errno(status); 381 mlog_errno(status);
368 goto leave; 382 goto leave;
369 } 383 }
370 le16_add_cpu(&dirfe->i_links_count, 1); 384 ocfs2_add_links_count(dirfe, 1);
371 status = ocfs2_journal_dirty(handle, parent_fe_bh); 385 status = ocfs2_journal_dirty(handle, parent_fe_bh);
372 if (status < 0) { 386 if (status < 0) {
373 mlog_errno(status); 387 mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
377 } 391 }
378 392
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, 393 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac); 394 meta_ac, data_ac);
381 if (status < 0) { 395 if (status < 0) {
382 mlog_errno(status); 396 mlog_errno(status);
383 goto leave; 397 goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
385 399
386 if (si.enable) { 400 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, 401 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac); 402 meta_ac, data_ac);
389 if (status < 0) { 403 if (status < 0) {
390 mlog_errno(status); 404 mlog_errno(status);
391 goto leave; 405 goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
394 408
395 status = ocfs2_add_entry(handle, dentry, inode, 409 status = ocfs2_add_entry(handle, dentry, inode,
396 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 410 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
397 de_bh); 411 &lookup);
398 if (status < 0) { 412 if (status < 0) {
399 mlog_errno(status); 413 mlog_errno(status);
400 goto leave; 414 goto leave;
@@ -423,11 +437,12 @@ leave:
423 mlog(0, "Disk is full\n"); 437 mlog(0, "Disk is full\n");
424 438
425 brelse(new_fe_bh); 439 brelse(new_fe_bh);
426 brelse(de_bh);
427 brelse(parent_fe_bh); 440 brelse(parent_fe_bh);
428 kfree(si.name); 441 kfree(si.name);
429 kfree(si.value); 442 kfree(si.value);
430 443
444 ocfs2_free_dir_lookup_result(&lookup);
445
431 if ((status < 0) && inode) { 446 if ((status < 0) && inode) {
432 clear_nlink(inode); 447 clear_nlink(inode);
433 iput(inode); 448 iput(inode);
@@ -439,8 +454,8 @@ leave:
439 if (data_ac) 454 if (data_ac)
440 ocfs2_free_alloc_context(data_ac); 455 ocfs2_free_alloc_context(data_ac);
441 456
442 if (xattr_ac) 457 if (meta_ac)
443 ocfs2_free_alloc_context(xattr_ac); 458 ocfs2_free_alloc_context(meta_ac);
444 459
445 mlog_exit(status); 460 mlog_exit(status);
446 461
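
[Annotation] The reworked ocfs2_mknod() above now sizes both reservations up front — want_meta for suballocator metadata (plus one extra block for the dx root when directory indexing is enabled) and want_clusters for data — and only then opens the transaction. A condensed sketch of that ordering, using only helpers visible in this diff and with error handling elided (not the literal kernel code):

	want_meta = 0;
	want_clusters = 0;

	/* xattr setup may need both clusters and metadata blocks */
	ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, &si,
			      &want_clusters, &xattr_credits, &want_meta);

	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
		want_clusters += 1;		/* first dir data cluster */
		if (ocfs2_supports_indexed_dirs(osb))
			want_meta++;		/* block for the dir index */
	}

	status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
	if (status >= 0)
		status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
	/* only then: ocfs2_start_trans(osb, ocfs2_mknod_credits(...)) */

Reserving everything before ocfs2_start_trans() keeps -ENOSPC failures cheap: nothing has been journaled yet, so the error path only frees the alloc contexts.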
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
462 struct ocfs2_extent_list *fel; 477 struct ocfs2_extent_list *fel;
463 u64 fe_blkno = 0; 478 u64 fe_blkno = 0;
464 u16 suballoc_bit; 479 u16 suballoc_bit;
480 u16 feat;
465 481
466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
467 inode->i_mode, (unsigned long)dev, dentry->d_name.len, 483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
469 485
470 *new_fe_bh = NULL; 486 *new_fe_bh = NULL;
471 487
472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
473 &fe_blkno); 489 inode_ac, &suballoc_bit, &fe_blkno);
474 if (status < 0) { 490 if (status < 0) {
475 mlog_errno(status); 491 mlog_errno(status);
476 goto leave; 492 goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
513 fe->i_mode = cpu_to_le16(inode->i_mode); 529 fe->i_mode = cpu_to_le16(inode->i_mode);
514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 530 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 531 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
516 fe->i_links_count = cpu_to_le16(inode->i_nlink); 532
533 ocfs2_set_links_count(fe, inode->i_nlink);
517 534
518 fe->i_last_eb_blk = 0; 535 fe->i_last_eb_blk = 0;
519 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 536 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
525 fe->i_dtime = 0; 542 fe->i_dtime = 0;
526 543
527 /* 544 /*
528 * If supported, directories start with inline data. 545 * If supported, directories start with inline data. If inline
546 * isn't supported, but indexing is, we start them as indexed.
529 */ 547 */
548 feat = le16_to_cpu(fe->i_dyn_features);
530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { 549 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
531 u16 feat = le16_to_cpu(fe->i_dyn_features);
532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 550 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 551
535 fe->id2.i_data.id_count = cpu_to_le16( 552 fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
608 int err; 625 int err;
609 struct buffer_head *fe_bh = NULL; 626 struct buffer_head *fe_bh = NULL;
610 struct buffer_head *parent_fe_bh = NULL; 627 struct buffer_head *parent_fe_bh = NULL;
611 struct buffer_head *de_bh = NULL;
612 struct ocfs2_dinode *fe = NULL; 628 struct ocfs2_dinode *fe = NULL;
613 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 629 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
630 struct ocfs2_dir_lookup_result lookup = { NULL, };
614 631
615 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 632 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
616 old_dentry->d_name.len, old_dentry->d_name.name, 633 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
638 655
639 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 656 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
640 dentry->d_name.name, 657 dentry->d_name.name,
641 dentry->d_name.len, &de_bh); 658 dentry->d_name.len, &lookup);
642 if (err < 0) { 659 if (err < 0) {
643 mlog_errno(err); 660 mlog_errno(err);
644 goto out; 661 goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
652 } 669 }
653 670
654 fe = (struct ocfs2_dinode *) fe_bh->b_data; 671 fe = (struct ocfs2_dinode *) fe_bh->b_data;
655 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { 672 if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
656 err = -EMLINK; 673 err = -EMLINK;
657 goto out_unlock_inode; 674 goto out_unlock_inode;
658 } 675 }
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
674 691
675 inc_nlink(inode); 692 inc_nlink(inode);
676 inode->i_ctime = CURRENT_TIME; 693 inode->i_ctime = CURRENT_TIME;
677 fe->i_links_count = cpu_to_le16(inode->i_nlink); 694 ocfs2_set_links_count(fe, inode->i_nlink);
678 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
679 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
680 697
681 err = ocfs2_journal_dirty(handle, fe_bh); 698 err = ocfs2_journal_dirty(handle, fe_bh);
682 if (err < 0) { 699 if (err < 0) {
683 le16_add_cpu(&fe->i_links_count, -1); 700 ocfs2_add_links_count(fe, -1);
684 drop_nlink(inode); 701 drop_nlink(inode);
685 mlog_errno(err); 702 mlog_errno(err);
686 goto out_commit; 703 goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
688 705
689 err = ocfs2_add_entry(handle, dentry, inode, 706 err = ocfs2_add_entry(handle, dentry, inode,
690 OCFS2_I(inode)->ip_blkno, 707 OCFS2_I(inode)->ip_blkno,
691 parent_fe_bh, de_bh); 708 parent_fe_bh, &lookup);
692 if (err) { 709 if (err) {
693 le16_add_cpu(&fe->i_links_count, -1); 710 ocfs2_add_links_count(fe, -1);
694 drop_nlink(inode); 711 drop_nlink(inode);
695 mlog_errno(err); 712 mlog_errno(err);
696 goto out_commit; 713 goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
714out: 731out:
715 ocfs2_inode_unlock(dir, 1); 732 ocfs2_inode_unlock(dir, 1);
716 733
717 brelse(de_bh);
718 brelse(fe_bh); 734 brelse(fe_bh);
719 brelse(parent_fe_bh); 735 brelse(parent_fe_bh);
720 736
737 ocfs2_free_dir_lookup_result(&lookup);
738
721 mlog_exit(err); 739 mlog_exit(err);
722 740
723 return err; 741 return err;
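
[Annotation] The common thread in these namei.c hunks is that every bare dirent buffer_head (de_bh) plus dirent pointer is replaced by a single struct ocfs2_dir_lookup_result with a fixed lifecycle: zero-initialize, fill via a prepare/find helper, pass by pointer to the insert/update/delete helpers, and free unconditionally in the exit path. In outline (condensed from the hunks above, not runnable standalone):

	struct ocfs2_dir_lookup_result lookup = { NULL, };

	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
					      name, namelen, &lookup);
	if (status == 0)
		status = ocfs2_add_entry(handle, dentry, inode, blkno,
					 parent_fe_bh, &lookup);

	ocfs2_free_dir_lookup_result(&lookup);	/* safe even if never filled */

Because the free is safe on an unused result, every exit path can call it without tracking which lookups actually succeeded — which is what lets the diff delete the scattered brelse(de_bh) calls.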
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
766 struct buffer_head *fe_bh = NULL; 784 struct buffer_head *fe_bh = NULL;
767 struct buffer_head *parent_node_bh = NULL; 785 struct buffer_head *parent_node_bh = NULL;
768 handle_t *handle = NULL; 786 handle_t *handle = NULL;
769 struct ocfs2_dir_entry *dirent = NULL;
770 struct buffer_head *dirent_bh = NULL;
771 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 787 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
772 struct buffer_head *orphan_entry_bh = NULL; 788 struct ocfs2_dir_lookup_result lookup = { NULL, };
789 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
773 790
774 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
775 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
791 } 808 }
792 809
793 status = ocfs2_find_files_on_disk(dentry->d_name.name, 810 status = ocfs2_find_files_on_disk(dentry->d_name.name,
794 dentry->d_name.len, &blkno, 811 dentry->d_name.len, &blkno, dir,
795 dir, &dirent_bh, &dirent); 812 &lookup);
796 if (status < 0) { 813 if (status < 0) {
797 if (status != -ENOENT) 814 if (status != -ENOENT)
798 mlog_errno(status); 815 mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
817 child_locked = 1; 834 child_locked = 1;
818 835
819 if (S_ISDIR(inode->i_mode)) { 836 if (S_ISDIR(inode->i_mode)) {
820 if (!ocfs2_empty_dir(inode)) { 837 if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
821 status = -ENOTEMPTY;
822 goto leave;
823 } else if (inode->i_nlink != 2) {
824 status = -ENOTEMPTY; 838 status = -ENOTEMPTY;
825 goto leave; 839 goto leave;
826 } 840 }
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
836 850
837 if (inode_is_unlinkable(inode)) { 851 if (inode_is_unlinkable(inode)) {
838 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 852 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
839 orphan_name, 853 orphan_name, &orphan_insert);
840 &orphan_entry_bh);
841 if (status < 0) { 854 if (status < 0) {
842 mlog_errno(status); 855 mlog_errno(status);
843 goto leave; 856 goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
863 876
864 if (inode_is_unlinkable(inode)) { 877 if (inode_is_unlinkable(inode)) {
865 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 878 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
866 orphan_entry_bh, orphan_dir); 879 &orphan_insert, orphan_dir);
867 if (status < 0) { 880 if (status < 0) {
868 mlog_errno(status); 881 mlog_errno(status);
869 goto leave; 882 goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
871 } 884 }
872 885
873 /* delete the name from the parent dir */ 886 /* delete the name from the parent dir */
874 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); 887 status = ocfs2_delete_entry(handle, dir, &lookup);
875 if (status < 0) { 888 if (status < 0) {
876 mlog_errno(status); 889 mlog_errno(status);
877 goto leave; 890 goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
880 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
881 drop_nlink(inode); 894 drop_nlink(inode);
882 drop_nlink(inode); 895 drop_nlink(inode);
883 fe->i_links_count = cpu_to_le16(inode->i_nlink); 896 ocfs2_set_links_count(fe, inode->i_nlink);
884 897
885 status = ocfs2_journal_dirty(handle, fe_bh); 898 status = ocfs2_journal_dirty(handle, fe_bh);
886 if (status < 0) { 899 if (status < 0) {
@@ -916,9 +929,10 @@ leave:
916 } 929 }
917 930
918 brelse(fe_bh); 931 brelse(fe_bh);
919 brelse(dirent_bh);
920 brelse(parent_node_bh); 932 brelse(parent_node_bh);
921 brelse(orphan_entry_bh); 933
934 ocfs2_free_dir_lookup_result(&orphan_insert);
935 ocfs2_free_dir_lookup_result(&lookup);
922 936
923 mlog_exit(status); 937 mlog_exit(status);
924 938
@@ -1004,29 +1018,27 @@ static int ocfs2_rename(struct inode *old_dir,
1004 struct inode *new_dir, 1018 struct inode *new_dir,
1005 struct dentry *new_dentry) 1019 struct dentry *new_dentry)
1006{ 1020{
1007 int status = 0, rename_lock = 0, parents_locked = 0; 1021 int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
1008 int old_child_locked = 0, new_child_locked = 0; 1022 int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
1009 struct inode *old_inode = old_dentry->d_inode; 1023 struct inode *old_inode = old_dentry->d_inode;
1010 struct inode *new_inode = new_dentry->d_inode; 1024 struct inode *new_inode = new_dentry->d_inode;
1011 struct inode *orphan_dir = NULL; 1025 struct inode *orphan_dir = NULL;
1012 struct ocfs2_dinode *newfe = NULL; 1026 struct ocfs2_dinode *newfe = NULL;
1013 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 1027 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1014 struct buffer_head *orphan_entry_bh = NULL;
1015 struct buffer_head *newfe_bh = NULL; 1028 struct buffer_head *newfe_bh = NULL;
1016 struct buffer_head *old_inode_bh = NULL; 1029 struct buffer_head *old_inode_bh = NULL;
1017 struct buffer_head *insert_entry_bh = NULL;
1018 struct ocfs2_super *osb = NULL; 1030 struct ocfs2_super *osb = NULL;
1019 u64 newfe_blkno, old_de_ino; 1031 u64 newfe_blkno, old_de_ino;
1020 handle_t *handle = NULL; 1032 handle_t *handle = NULL;
1021 struct buffer_head *old_dir_bh = NULL; 1033 struct buffer_head *old_dir_bh = NULL;
1022 struct buffer_head *new_dir_bh = NULL; 1034 struct buffer_head *new_dir_bh = NULL;
1023 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1024 *new_de = NULL;
1025 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1026 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1027 // this is the 1st dirent bh
1028 nlink_t old_dir_nlink = old_dir->i_nlink; 1035 nlink_t old_dir_nlink = old_dir->i_nlink;
1029 struct ocfs2_dinode *old_di; 1036 struct ocfs2_dinode *old_di;
1037 struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
1038 struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
1039 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
1040 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
1041 struct ocfs2_dir_lookup_result target_insert = { NULL, };
1030 1042
1031 /* At some point it might be nice to break this function up a 1043 /* At some point it might be nice to break this function up a
1032 * bit. */ 1044 * bit. */
@@ -1108,9 +1120,10 @@ static int ocfs2_rename(struct inode *old_dir,
1108 if (S_ISDIR(old_inode->i_mode)) { 1120 if (S_ISDIR(old_inode->i_mode)) {
1109 u64 old_inode_parent; 1121 u64 old_inode_parent;
1110 1122
1123 update_dot_dot = 1;
1111 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, 1124 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1112 old_inode, &old_inode_de_bh, 1125 old_inode,
1113 &old_inode_dot_dot_de); 1126 &old_inode_dot_dot_res);
1114 if (status) { 1127 if (status) {
1115 status = -EIO; 1128 status = -EIO;
1116 goto bail; 1129 goto bail;
@@ -1122,7 +1135,7 @@ static int ocfs2_rename(struct inode *old_dir,
1122 } 1135 }
1123 1136
1124 if (!new_inode && new_dir != old_dir && 1137 if (!new_inode && new_dir != old_dir &&
1125 new_dir->i_nlink >= OCFS2_LINK_MAX) { 1138 new_dir->i_nlink >= ocfs2_link_max(osb)) {
1126 status = -EMLINK; 1139 status = -EMLINK;
1127 goto bail; 1140 goto bail;
1128 } 1141 }
@@ -1151,8 +1164,8 @@ static int ocfs2_rename(struct inode *old_dir,
1151 * to delete it */ 1164 * to delete it */
1152 status = ocfs2_find_files_on_disk(new_dentry->d_name.name, 1165 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1153 new_dentry->d_name.len, 1166 new_dentry->d_name.len,
1154 &newfe_blkno, new_dir, &new_de_bh, 1167 &newfe_blkno, new_dir,
1155 &new_de); 1168 &target_lookup_res);
1156 /* The only error we allow here is -ENOENT because the new 1169 /* The only error we allow here is -ENOENT because the new
1157 * file not existing is perfectly valid. */ 1170 * file not existing is perfectly valid. */
1158 if ((status < 0) && (status != -ENOENT)) { 1171 if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1174,10 @@ static int ocfs2_rename(struct inode *old_dir,
1161 mlog_errno(status); 1174 mlog_errno(status);
1162 goto bail; 1175 goto bail;
1163 } 1176 }
1177 if (status == 0)
1178 target_exists = 1;
1164 1179
1165 if (!new_de && new_inode) { 1180 if (!target_exists && new_inode) {
1166 /* 1181 /*
1167 * Target was unlinked by another node while we were 1182 * Target was unlinked by another node while we were
1168 * waiting to get to ocfs2_rename(). There isn't 1183 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1190,7 @@ static int ocfs2_rename(struct inode *old_dir,
1175 1190
1176 /* In case we need to overwrite an existing file, we blow it 1191 /* In case we need to overwrite an existing file, we blow it
1177 * away first */ 1192 * away first */
1178 if (new_de) { 1193 if (target_exists) {
1179 /* VFS didn't think there existed an inode here, but 1194 /* VFS didn't think there existed an inode here, but
1180 * someone else in the cluster must have raced our 1195 * someone else in the cluster must have raced our
1181 * rename to create one. Today we error cleanly, in 1196 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1231,8 @@ static int ocfs2_rename(struct inode *old_dir,
1216 1231
1217 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1232 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1218 1233
1219 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " 1234 mlog(0, "aha rename over existing... new_blkno=%llu "
1220 "newfebh=%p bhblocknr=%llu\n", new_de, 1235 "newfebh=%p bhblocknr=%llu\n",
1221 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1236 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1222 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1237 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1223 1238
@@ -1225,7 +1240,7 @@ static int ocfs2_rename(struct inode *old_dir,
1225 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1240 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1226 new_inode, 1241 new_inode,
1227 orphan_name, 1242 orphan_name,
1228 &orphan_entry_bh); 1243 &orphan_insert);
1229 if (status < 0) { 1244 if (status < 0) {
1230 mlog_errno(status); 1245 mlog_errno(status);
1231 goto bail; 1246 goto bail;
@@ -1243,7 +1258,7 @@ static int ocfs2_rename(struct inode *old_dir,
1243 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, 1258 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1244 new_dentry->d_name.name, 1259 new_dentry->d_name.name,
1245 new_dentry->d_name.len, 1260 new_dentry->d_name.len,
1246 &insert_entry_bh); 1261 &target_insert);
1247 if (status < 0) { 1262 if (status < 0) {
1248 mlog_errno(status); 1263 mlog_errno(status);
1249 goto bail; 1264 goto bail;
@@ -1258,10 +1273,10 @@ static int ocfs2_rename(struct inode *old_dir,
1258 goto bail; 1273 goto bail;
1259 } 1274 }
1260 1275
1261 if (new_de) { 1276 if (target_exists) {
1262 if (S_ISDIR(new_inode->i_mode)) { 1277 if (S_ISDIR(new_inode->i_mode)) {
1263 if (!ocfs2_empty_dir(new_inode) || 1278 if (new_inode->i_nlink != 2 ||
1264 new_inode->i_nlink != 2) { 1279 !ocfs2_empty_dir(new_inode)) {
1265 status = -ENOTEMPTY; 1280 status = -ENOTEMPTY;
1266 goto bail; 1281 goto bail;
1267 } 1282 }
@@ -1274,10 +1289,10 @@ static int ocfs2_rename(struct inode *old_dir,
1274 } 1289 }
1275 1290
1276 if (S_ISDIR(new_inode->i_mode) || 1291 if (S_ISDIR(new_inode->i_mode) ||
1277 (newfe->i_links_count == cpu_to_le16(1))){ 1292 (ocfs2_read_links_count(newfe) == 1)) {
1278 status = ocfs2_orphan_add(osb, handle, new_inode, 1293 status = ocfs2_orphan_add(osb, handle, new_inode,
1279 newfe, orphan_name, 1294 newfe, orphan_name,
1280 orphan_entry_bh, orphan_dir); 1295 &orphan_insert, orphan_dir);
1281 if (status < 0) { 1296 if (status < 0) {
1282 mlog_errno(status); 1297 mlog_errno(status);
1283 goto bail; 1298 goto bail;
@@ -1285,8 +1300,8 @@ static int ocfs2_rename(struct inode *old_dir,
1285 } 1300 }
1286 1301
1287 /* change the dirent to point to the correct inode */ 1302 /* change the dirent to point to the correct inode */
1288 status = ocfs2_update_entry(new_dir, handle, new_de_bh, 1303 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
1289 new_de, old_inode); 1304 old_inode);
1290 if (status < 0) { 1305 if (status < 0) {
1291 mlog_errno(status); 1306 mlog_errno(status);
1292 goto bail; 1307 goto bail;
@@ -1294,9 +1309,9 @@ static int ocfs2_rename(struct inode *old_dir,
1294 new_dir->i_version++; 1309 new_dir->i_version++;
1295 1310
1296 if (S_ISDIR(new_inode->i_mode)) 1311 if (S_ISDIR(new_inode->i_mode))
1297 newfe->i_links_count = 0; 1312 ocfs2_set_links_count(newfe, 0);
1298 else 1313 else
1299 le16_add_cpu(&newfe->i_links_count, -1); 1314 ocfs2_add_links_count(newfe, -1);
1300 1315
1301 status = ocfs2_journal_dirty(handle, newfe_bh); 1316 status = ocfs2_journal_dirty(handle, newfe_bh);
1302 if (status < 0) { 1317 if (status < 0) {
@@ -1307,7 +1322,7 @@ static int ocfs2_rename(struct inode *old_dir,
1307 /* if the name was not found in new_dir, add it now */ 1322 /* if the name was not found in new_dir, add it now */
1308 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1323 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1309 OCFS2_I(old_inode)->ip_blkno, 1324 OCFS2_I(old_inode)->ip_blkno,
1310 new_dir_bh, insert_entry_bh); 1325 new_dir_bh, &target_insert);
1311 } 1326 }
1312 1327
1313 old_inode->i_ctime = CURRENT_TIME; 1328 old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1349,13 @@ static int ocfs2_rename(struct inode *old_dir,
1334 * because the insert might have changed the type of directory 1349 * because the insert might have changed the type of directory
1335 * we're dealing with. 1350 * we're dealing with.
1336 */ 1351 */
1337 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1352 status = ocfs2_find_entry(old_dentry->d_name.name,
1338 old_dentry->d_name.len, 1353 old_dentry->d_name.len, old_dir,
1339 old_dir, &old_de); 1354 &old_entry_lookup);
1340 if (!old_de_bh) { 1355 if (status)
1341 status = -EIO;
1342 goto bail; 1356 goto bail;
1343 }
1344 1357
1345 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1358 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1346 if (status < 0) { 1359 if (status < 0) {
1347 mlog_errno(status); 1360 mlog_errno(status);
1348 goto bail; 1361 goto bail;
@@ -1353,9 +1366,10 @@ static int ocfs2_rename(struct inode *old_dir,
1353 new_inode->i_ctime = CURRENT_TIME; 1366 new_inode->i_ctime = CURRENT_TIME;
1354 } 1367 }
1355 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1368 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1356 if (old_inode_de_bh) { 1369
1357 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, 1370 if (update_dot_dot) {
1358 old_inode_dot_dot_de, new_dir); 1371 status = ocfs2_update_entry(old_inode, handle,
1372 &old_inode_dot_dot_res, new_dir);
1359 old_dir->i_nlink--; 1373 old_dir->i_nlink--;
1360 if (new_inode) { 1374 if (new_inode) {
1361 new_inode->i_nlink--; 1375 new_inode->i_nlink--;
@@ -1391,14 +1405,13 @@ static int ocfs2_rename(struct inode *old_dir,
1391 } else { 1405 } else {
1392 struct ocfs2_dinode *fe; 1406 struct ocfs2_dinode *fe;
1393 status = ocfs2_journal_access_di(handle, old_dir, 1407 status = ocfs2_journal_access_di(handle, old_dir,
1394 old_dir_bh, 1408 old_dir_bh,
1395 OCFS2_JOURNAL_ACCESS_WRITE); 1409 OCFS2_JOURNAL_ACCESS_WRITE);
1396 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1410 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1397 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1411 ocfs2_set_links_count(fe, old_dir->i_nlink);
1398 status = ocfs2_journal_dirty(handle, old_dir_bh); 1412 status = ocfs2_journal_dirty(handle, old_dir_bh);
1399 } 1413 }
1400 } 1414 }
1401
1402 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1415 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1403 status = 0; 1416 status = 0;
1404bail: 1417bail:
@@ -1429,15 +1442,17 @@ bail:
1429 1442
1430 if (new_inode) 1443 if (new_inode)
1431 iput(new_inode); 1444 iput(new_inode);
1445
1446 ocfs2_free_dir_lookup_result(&target_lookup_res);
1447 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1448 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1449 ocfs2_free_dir_lookup_result(&orphan_insert);
1450 ocfs2_free_dir_lookup_result(&target_insert);
1451
1432 brelse(newfe_bh); 1452 brelse(newfe_bh);
1433 brelse(old_inode_bh); 1453 brelse(old_inode_bh);
1434 brelse(old_dir_bh); 1454 brelse(old_dir_bh);
1435 brelse(new_dir_bh); 1455 brelse(new_dir_bh);
1436 brelse(new_de_bh);
1437 brelse(old_de_bh);
1438 brelse(old_inode_de_bh);
1439 brelse(orphan_entry_bh);
1440 brelse(insert_entry_bh);
1441 1456
1442 mlog_exit(status); 1457 mlog_exit(status);
1443 1458
@@ -1558,7 +1573,6 @@ static int ocfs2_symlink(struct inode *dir,
1558 struct inode *inode = NULL; 1573 struct inode *inode = NULL;
1559 struct super_block *sb; 1574 struct super_block *sb;
1560 struct buffer_head *new_fe_bh = NULL; 1575 struct buffer_head *new_fe_bh = NULL;
1561 struct buffer_head *de_bh = NULL;
1562 struct buffer_head *parent_fe_bh = NULL; 1576 struct buffer_head *parent_fe_bh = NULL;
1563 struct ocfs2_dinode *fe = NULL; 1577 struct ocfs2_dinode *fe = NULL;
1564 struct ocfs2_dinode *dirfe; 1578 struct ocfs2_dinode *dirfe;
@@ -1572,6 +1586,7 @@ static int ocfs2_symlink(struct inode *dir,
1572 .enable = 1, 1586 .enable = 1,
1573 }; 1587 };
1574 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1575 1590
1576 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1591 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1577 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1592 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1607,7 @@ static int ocfs2_symlink(struct inode *dir,
1592 } 1607 }
1593 1608
1594 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1609 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1595 if (!dirfe->i_links_count) { 1610 if (!ocfs2_read_links_count(dirfe)) {
1596 /* can't make a file in a deleted directory. */ 1611 /* can't make a file in a deleted directory. */
1597 status = -ENOENT; 1612 status = -ENOENT;
1598 goto bail; 1613 goto bail;
@@ -1605,7 +1620,7 @@ static int ocfs2_symlink(struct inode *dir,
1605 1620
1606 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1621 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1607 dentry->d_name.name, 1622 dentry->d_name.name,
1608 dentry->d_name.len, &de_bh); 1623 dentry->d_name.len, &lookup);
1609 if (status < 0) { 1624 if (status < 0) {
1610 mlog_errno(status); 1625 mlog_errno(status);
1611 goto bail; 1626 goto bail;
@@ -1744,7 +1759,7 @@ static int ocfs2_symlink(struct inode *dir,
1744 1759
1745 status = ocfs2_add_entry(handle, dentry, inode, 1760 status = ocfs2_add_entry(handle, dentry, inode,
1746 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1761 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1747 de_bh); 1762 &lookup);
1748 if (status < 0) { 1763 if (status < 0) {
1749 mlog_errno(status); 1764 mlog_errno(status);
1750 goto bail; 1765 goto bail;
@@ -1772,9 +1787,9 @@ bail:
1772 1787
1773 brelse(new_fe_bh); 1788 brelse(new_fe_bh);
1774 brelse(parent_fe_bh); 1789 brelse(parent_fe_bh);
1775 brelse(de_bh);
1776 kfree(si.name); 1790 kfree(si.name);
1777 kfree(si.value); 1791 kfree(si.value);
1792 ocfs2_free_dir_lookup_result(&lookup);
1778 if (inode_ac) 1793 if (inode_ac)
1779 ocfs2_free_alloc_context(inode_ac); 1794 ocfs2_free_alloc_context(inode_ac);
1780 if (data_ac) 1795 if (data_ac)
@@ -1826,7 +1841,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1826 struct inode **ret_orphan_dir, 1841 struct inode **ret_orphan_dir,
1827 struct inode *inode, 1842 struct inode *inode,
1828 char *name, 1843 char *name,
1829 struct buffer_head **de_bh) 1844 struct ocfs2_dir_lookup_result *lookup)
1830{ 1845{
1831 struct inode *orphan_dir_inode; 1846 struct inode *orphan_dir_inode;
1832 struct buffer_head *orphan_dir_bh = NULL; 1847 struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1872,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1857 1872
1858 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1873 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1859 orphan_dir_bh, name, 1874 orphan_dir_bh, name,
1860 OCFS2_ORPHAN_NAMELEN, de_bh); 1875 OCFS2_ORPHAN_NAMELEN, lookup);
1861 if (status < 0) { 1876 if (status < 0) {
1862 ocfs2_inode_unlock(orphan_dir_inode, 1); 1877 ocfs2_inode_unlock(orphan_dir_inode, 1);
1863 1878
@@ -1884,7 +1899,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1884 struct inode *inode, 1899 struct inode *inode,
1885 struct ocfs2_dinode *fe, 1900 struct ocfs2_dinode *fe,
1886 char *name, 1901 char *name,
1887 struct buffer_head *de_bh, 1902 struct ocfs2_dir_lookup_result *lookup,
1888 struct inode *orphan_dir_inode) 1903 struct inode *orphan_dir_inode)
1889{ 1904{
1890 struct buffer_head *orphan_dir_bh = NULL; 1905 struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1925,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 * underneath us... */ 1925 * underneath us... */
1911 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1926 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1912 if (S_ISDIR(inode->i_mode)) 1927 if (S_ISDIR(inode->i_mode))
1913 le16_add_cpu(&orphan_fe->i_links_count, 1); 1928 ocfs2_add_links_count(orphan_fe, 1);
1914 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1929 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1915 1930
1916 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1931 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1917 if (status < 0) { 1932 if (status < 0) {
@@ -1922,7 +1937,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1922 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1937 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1923 OCFS2_ORPHAN_NAMELEN, inode, 1938 OCFS2_ORPHAN_NAMELEN, inode,
1924 OCFS2_I(inode)->ip_blkno, 1939 OCFS2_I(inode)->ip_blkno,
1925 orphan_dir_bh, de_bh); 1940 orphan_dir_bh, lookup);
1926 if (status < 0) { 1941 if (status < 0) {
1927 mlog_errno(status); 1942 mlog_errno(status);
1928 goto leave; 1943 goto leave;
@@ -1955,8 +1970,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1955 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1970 char name[OCFS2_ORPHAN_NAMELEN + 1];
1956 struct ocfs2_dinode *orphan_fe; 1971 struct ocfs2_dinode *orphan_fe;
1957 int status = 0; 1972 int status = 0;
1958 struct buffer_head *target_de_bh = NULL; 1973 struct ocfs2_dir_lookup_result lookup = { NULL, };
1959 struct ocfs2_dir_entry *target_de = NULL;
1960 1974
1961 mlog_entry_void(); 1975 mlog_entry_void();
1962 1976
@@ -1971,17 +1985,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1971 OCFS2_ORPHAN_NAMELEN); 1985 OCFS2_ORPHAN_NAMELEN);
1972 1986
 1973 /* find its spot in the orphan directory */ 1987 /* find its spot in the orphan directory */
1974 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1988 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1975 orphan_dir_inode, &target_de); 1989 &lookup);
1976 if (!target_de_bh) { 1990 if (status) {
1977 status = -ENOENT;
1978 mlog_errno(status); 1991 mlog_errno(status);
1979 goto leave; 1992 goto leave;
1980 } 1993 }
1981 1994
1982 /* remove it from the orphan directory */ 1995 /* remove it from the orphan directory */
1983 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 1996 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1984 target_de_bh);
1985 if (status < 0) { 1997 if (status < 0) {
1986 mlog_errno(status); 1998 mlog_errno(status);
1987 goto leave; 1999 goto leave;
@@ -1997,8 +2009,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1997 /* do the i_nlink dance! :) */ 2009 /* do the i_nlink dance! :) */
1998 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2010 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1999 if (S_ISDIR(inode->i_mode)) 2011 if (S_ISDIR(inode->i_mode))
2000 le16_add_cpu(&orphan_fe->i_links_count, -1); 2012 ocfs2_add_links_count(orphan_fe, -1);
2001 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2013 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2002 2014
2003 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2015 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2004 if (status < 0) { 2016 if (status < 0) {
@@ -2007,7 +2019,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2007 } 2019 }
2008 2020
2009leave: 2021leave:
2010 brelse(target_de_bh); 2022 ocfs2_free_dir_lookup_result(&lookup);
2011 2023
2012 mlog_exit(status); 2024 mlog_exit(status);
2013 return status; 2025 return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
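
[Annotation] To see how the new helpers split a large link count across i_links_count and i_links_count_hi, here is a small standalone illustration (endianness conversions dropped for brevity; OCFS2_LINKS_HI_SHIFT is 16, as defined later in this diff):

	#include <stdio.h>

	int main(void)
	{
		unsigned int nlink = 100000;	/* 0x186A0, needs > 16 bits */
		unsigned short lo = (unsigned short)nlink;	   /* 0x86A0 */
		unsigned short hi = (unsigned short)(nlink >> 16); /* 0x0001 */

		/* ocfs2_read_links_count() only folds the high word back in
		 * when OCFS2_INDEXED_DIR_FL is set, so pre-indexing inodes
		 * keep their old 16-bit semantics. */
		unsigned int readback = lo | ((unsigned int)hi << 16);

		printf("%u -> lo=%u hi=%u -> %u\n", nlink, lo, hi, readback);
		return 0;
	}

This is why ocfs2_link_max() can return OCFS2_DX_LINK_MAX ((1 << 31) - 1) on indexed filesystems while plain ones stay at 32000.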
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
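
[Annotation] The new ocfs2_block_to_cluster_start() rounds a block number down to the first block of its cluster using pure shifts. A worked example with an assumed geometry (4K blocks, i.e. 12 bits, and 1MB clusters, i.e. 20 bits — these sizes are illustrative, not from the diff):

	#include <stdio.h>

	typedef unsigned long long u64;

	int main(void)
	{
		int bits = 20 - 12;	/* 8: 256 blocks per cluster */
		u64 block = 300;
		unsigned int clusters = (unsigned int)(block >> bits); /* 1 */
		u64 start = (u64)clusters << bits;		       /* 256 */

		printf("block %llu is in cluster %u starting at block %llu\n",
		       block, clusters, start);
		return 0;
	}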
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
 157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
 800 * Entries are keyed by the name hash alone, so a hash collision may
 801 * require checking more than one referenced dirent during lookup.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
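
[Annotation] Taken together, the structures above describe a two-level lookup: dx_major_hash is searched in the dr_list extent tree (or directly in dr_entries while the index is still inline) to find a cluster of dx leaves, dx_minor_hash selects a leaf block within that cluster, and each matching ocfs2_dx_entry points back into the unindexed tree via dx_dirent_blk. A rough sketch of that flow — the helper names (compute_name_hash, extent_lookup, cluster_to_block) and the leaf-selection rule are placeholders for exposition, not the kernel's actual functions:

	u32 major, minor;

	compute_name_hash(name, namelen, osb->osb_dx_seed, &major, &minor);

	if (dx_root->dr_flags & OCFS2_DX_FLAG_INLINE) {
		/* small dir: scan dx_root->dr_entries.de_entries directly */
	} else {
		u32 cluster = extent_lookup(&dx_root->dr_list, major);
		u64 leaf_blk = cluster_to_block(cluster) +
			       minor % blocks_per_cluster;
		/* read the ocfs2_dx_leaf at leaf_blk, scan dl_list for a
		 * matching (major, minor) pair, then read the real dirent
		 * at entry->dx_dirent_blk to compare the actual name. */
	}

The dual-hash design keeps index entries at a fixed 16 bytes while still narrowing a collision down to one leaf block before any full dirents have to be read.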
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1112 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1113} 1209}
1114 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1115static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1116{ 1222{
1117 int size; 1223 int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1132 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1133} 1239}
1134 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1135static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1136{ 1262{
1137 u16 size; 1263 u16 size;
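
[Annotation] These sizing helpers just divide whatever space follows the header by sizeof(struct ocfs2_dx_entry) — 16 bytes (two __le32 hashes plus a __le64 block pointer). Back-of-envelope, assuming a 4K block, an 8-byte struct ocfs2_block_check, and no compiler padding (assumptions, not facts from the diff): the dx_leaf header is 40 bytes and the entry-list header adds 8 more, so:

	#include <stdio.h>

	int main(void)
	{
		int size = 4096 - 48;	/* bytes left for dx entries */
		printf("dx entries per 4K leaf: %d\n", size / 16); /* 253 */
		return 0;
	}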
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..8439f6b324b9 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
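
[Annotation] Note the quiet but important change above: ALLOC_NEW_GROUP goes from the plain value 1 to the bit 0x1 so that it can be OR-ed with the new ALLOC_GROUPS_FROM_GLOBAL, and callers switch from equality tests to mask tests (both forms appear in the hunks below):

	/* old style: exact match on an int */
	if (alloc_new_group != ALLOC_NEW_GROUP)
		/* ... don't allocate a new group ... */;

	/* new style: flags word, each bit tested independently */
	int flags = ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL;
	if (!(flags & ALLOC_NEW_GROUP))
		/* ... */;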
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
 1662	 * If the parent dir has recorded the last group it used for
 1663	 * allocation, use that. Otherwise, if the new inode comes from the
 1664	 * same slot the parent dir belongs to, start from the parent's group.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,167 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2201
 2202	/* read the inode block straight from disk, without cluster locking */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n",
2206 (unsigned long long)blkno, status);
2207 goto bail;
2208 }
2209
2210 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2211 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2212 mlog(ML_ERROR, "invalid inode %llu requested\n",
2213 (unsigned long long)blkno);
2214 status = -EINVAL;
2215 goto bail;
2216 }
2217
2218 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2219 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2220 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2221 (unsigned long long)blkno,
2222 (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2223 status = -EINVAL;
2224 goto bail;
2225 }
2226
2227 if (suballoc_slot)
2228 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2229 if (suballoc_bit)
2230 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2231
2232bail:
2233 brelse(inode_bh);
2234
2235 mlog_exit(status);
2236 return status;
2237}
2238
2239/*
 2240 * Test whether the bit is set in the allocator bitmap. On success, 0
 2241 * is returned and *res is 1 if the bit is set, 0 otherwise. On
 2242 * failure, a negative errno is returned and *res is meaningless. Call
 2243 * this with the cluster lock held against the suballocator, or you
 2244 * may get a result based on out-of-date contents.
2245 */
2246static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2247 struct inode *suballoc,
2248 struct buffer_head *alloc_bh, u64 blkno,
2249 u16 bit, int *res)
2250{
2251 struct ocfs2_dinode *alloc_fe;
2252 struct ocfs2_group_desc *group;
2253 struct buffer_head *group_bh = NULL;
2254 u64 bg_blkno;
2255 int status;
2256
2257 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2258 (unsigned int)bit);
2259
2260 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2261 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2262 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2263 (unsigned int)bit,
2264 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2265 status = -EINVAL;
2266 goto bail;
2267 }
2268
2269 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2270 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2271 &group_bh);
2272 if (status < 0) {
2273 mlog(ML_ERROR, "read group %llu failed %d\n",
2274 (unsigned long long)bg_blkno, status);
2275 goto bail;
2276 }
2277
2278 group = (struct ocfs2_group_desc *) group_bh->b_data;
2279 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2280
2281bail:
2282 brelse(group_bh);
2283
2284 mlog_exit(status);
2285 return status;
2286}
2287
2288/*
2289 * Test if the bit representing this inode (blkno) is set in the
2290 * suballocator.
2291 *
2292 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2293 *
2294 * In the event of failure, a negative value is returned and *res is
2295 * meaningless.
2296 *
2297 * Callers must make sure to hold nfs_sync_lock to prevent
2298 * ocfs2_delete_inode() on another node from accessing the same
2299 * suballocator concurrently.
2300 */
2301int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2302{
2303 int status;
2304 u16 suballoc_bit = 0, suballoc_slot = 0;
2305 struct inode *inode_alloc_inode;
2306 struct buffer_head *alloc_bh = NULL;
2307
2308 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2309
2310 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2311 &suballoc_bit);
2312 if (status < 0) {
2313 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2314 goto bail;
2315 }
2316
2317 inode_alloc_inode =
2318 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2319 suballoc_slot);
2320 if (!inode_alloc_inode) {
 2321		/* the error code could be inaccurate, but we have no way to
 2322		 * recover the correct one. */
2323 status = -EINVAL;
2324 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2325 (u32)suballoc_slot);
2326 goto bail;
2327 }
2328
2329 mutex_lock(&inode_alloc_inode->i_mutex);
2330 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2331 if (status < 0) {
2332 mutex_unlock(&inode_alloc_inode->i_mutex);
2333 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2334 (u32)suballoc_slot, status);
2335 goto bail;
2336 }
2337
2338 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2339 blkno, suballoc_bit, res);
2340 if (status < 0)
2341 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2342
2343 ocfs2_inode_unlock(inode_alloc_inode, 0);
2344 mutex_unlock(&inode_alloc_inode->i_mutex);
2345
2346 iput(inode_alloc_inode);
2347 brelse(alloc_bh);
2348bail:
2349 mlog_exit(status);
2350 return status;
2351}
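
The suballoc.c changes above implement a simple hint-caching policy: remember the group in which the last inode allocation succeeded and start the next search there, but only trust a hint recorded for the slot currently held. A minimal sketch of that policy, using hypothetical alloc_hint/alloc_ctxt types in place of the real ocfs2 structures:

/* Hypothetical stand-ins for the ip_last_used_group/ip_last_used_slot
 * fields this patch adds and for struct ocfs2_alloc_context. */
struct alloc_hint { unsigned long group; int slot; };
struct alloc_ctxt { unsigned long last_group; int slot; };

/* Seed the search from the cached hint only if it was recorded for the
 * slot we hold locked; a hint from another slot could name a group
 * descriptor we have no lock on. Failing that, fall back to the group
 * the parent inode itself sits in, when it shares our slot. */
static void init_from_hint(struct alloc_ctxt *ac, const struct alloc_hint *h,
			   unsigned long parent_group, int parent_slot)
{
	if (h->group && h->slot == ac->slot)
		ac->last_group = h->group;
	else if (parent_slot == ac->slot)
		ac->last_group = parent_group;
}

/* After a successful claim, remember where it landed for next time. */
static void save_hint(struct alloc_hint *h, const struct alloc_ctxt *ac)
{
	h->group = ac->last_group;
	h->slot = ac->slot;
}

The same idea drives the osb_inode_alloc_group value updated under osb_lock in ocfs2_reserve_new_inode() above.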
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
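
The fs_state file added above follows a common debugfs idiom: render one snapshot of the state into a kmalloc'ed buffer at open() time, record its length as the inode size, serve it with simple_read_from_buffer(), and free it on release. A stripped-down sketch of the same idiom; render_state() is a hypothetical stand-in for ocfs2_osb_dump():

#include <linux/fs.h>
#include <linux/slab.h>

static int render_state(void *priv, char *buf, int len);	/* hypothetical */

/* Take the snapshot once, at open time. */
static int snap_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	i_size_write(inode, render_state(inode->i_private, buf, PAGE_SIZE));
	file->private_data = buf;
	return 0;
}

/* simple_read_from_buffer() handles the bounds checks and copy_to_user(). */
static ssize_t snap_read(struct file *file, char __user *ubuf,
			 size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(ubuf, nbytes, ppos,
				       file->private_data,
				       i_size_read(file->f_mapping->host));
}

static int snap_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);	/* the snapshot dies with the fd */
	return 0;
}

One consequence, true of the patch as well: the contents are frozen at open() time, so a reader that keeps the file open sees a stale snapshot until it reopens.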
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ed0a0cfd68d2..579dd1b1110f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -39,6 +39,7 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/utsname.h> 41#include <linux/utsname.h>
42#include <linux/namei.h>
42 43
43#define MLOG_MASK_PREFIX ML_NAMEI 44#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h> 45#include <cluster/masklog.h>
@@ -54,26 +55,6 @@
54 55
55#include "buffer_head_io.h" 56#include "buffer_head_io.h"
56 57
57static char *ocfs2_page_getlink(struct dentry * dentry,
58 struct page **ppage);
59static char *ocfs2_fast_symlink_getlink(struct inode *inode,
60 struct buffer_head **bh);
61
62/* get the link contents into pagecache */
63static char *ocfs2_page_getlink(struct dentry * dentry,
64 struct page **ppage)
65{
66 struct page * page;
67 struct address_space *mapping = dentry->d_inode->i_mapping;
68 page = read_mapping_page(mapping, 0, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 *ppage = page;
72 return kmap(page);
73
74sync_fail:
75 return (char*)page;
76}
77 58
78static char *ocfs2_fast_symlink_getlink(struct inode *inode, 59static char *ocfs2_fast_symlink_getlink(struct inode *inode,
79 struct buffer_head **bh) 60 struct buffer_head **bh)
@@ -128,40 +109,55 @@ out:
128 return ret; 109 return ret;
129} 110}
130 111
131static void *ocfs2_follow_link(struct dentry *dentry, 112static void *ocfs2_fast_follow_link(struct dentry *dentry,
132 struct nameidata *nd) 113 struct nameidata *nd)
133{ 114{
134 int status; 115 int status = 0;
135 char *link; 116 int len;
117 char *target, *link = ERR_PTR(-ENOMEM);
136 struct inode *inode = dentry->d_inode; 118 struct inode *inode = dentry->d_inode;
137 struct page *page = NULL;
138 struct buffer_head *bh = NULL; 119 struct buffer_head *bh = NULL;
139 120
140 if (ocfs2_inode_is_fast_symlink(inode)) 121 mlog_entry_void();
141 link = ocfs2_fast_symlink_getlink(inode, &bh); 122
142 else 123 BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
143 link = ocfs2_page_getlink(dentry, &page); 124 target = ocfs2_fast_symlink_getlink(inode, &bh);
144 if (IS_ERR(link)) { 125 if (IS_ERR(target)) {
145 status = PTR_ERR(link); 126 status = PTR_ERR(target);
146 mlog_errno(status); 127 mlog_errno(status);
147 goto bail; 128 goto bail;
148 } 129 }
149 130
150 status = vfs_follow_link(nd, link); 131 /* Fast symlinks can't be large */
132 len = strlen(target);
133 link = kzalloc(len + 1, GFP_NOFS);
134 if (!link) {
135 status = -ENOMEM;
136 mlog_errno(status);
137 goto bail;
138 }
139
140 memcpy(link, target, len);
141 nd_set_link(nd, link);
151 142
152bail: 143bail:
153 if (page) {
154 kunmap(page);
155 page_cache_release(page);
156 }
157 brelse(bh); 144 brelse(bh);
158 145
159 return ERR_PTR(status); 146 mlog_exit(status);
147 return status ? ERR_PTR(status) : link;
148}
149
150static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
151{
152 char *link = cookie;
153
154 kfree(link);
160} 155}
161 156
162const struct inode_operations ocfs2_symlink_inode_operations = { 157const struct inode_operations ocfs2_symlink_inode_operations = {
163 .readlink = page_readlink, 158 .readlink = page_readlink,
164 .follow_link = ocfs2_follow_link, 159 .follow_link = page_follow_link_light,
160 .put_link = page_put_link,
165 .getattr = ocfs2_getattr, 161 .getattr = ocfs2_getattr,
166 .setattr = ocfs2_setattr, 162 .setattr = ocfs2_setattr,
167 .setxattr = generic_setxattr, 163 .setxattr = generic_setxattr,
@@ -171,7 +167,8 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
171}; 167};
172const struct inode_operations ocfs2_fast_symlink_inode_operations = { 168const struct inode_operations ocfs2_fast_symlink_inode_operations = {
173 .readlink = ocfs2_readlink, 169 .readlink = ocfs2_readlink,
174 .follow_link = ocfs2_follow_link, 170 .follow_link = ocfs2_fast_follow_link,
171 .put_link = ocfs2_fast_put_link,
175 .getattr = ocfs2_getattr, 172 .getattr = ocfs2_getattr,
176 .setattr = ocfs2_setattr, 173 .setattr = ocfs2_setattr,
177 .setxattr = generic_setxattr, 174 .setxattr = generic_setxattr,
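
The symlink rework above moves ocfs2 onto the ->follow_link()/->put_link() contract of this kernel: follow_link publishes the target string with nd_set_link() and returns a cookie, which the VFS hands back to put_link for cleanup once the name walk is done (put_link is skipped when follow_link returns an ERR_PTR). A minimal sketch of the contract; get_target() is a hypothetical helper returning a kmalloc'ed copy of the link body or an ERR_PTR:

#include <linux/err.h>
#include <linux/namei.h>
#include <linux/slab.h>

static char *get_target(struct inode *inode);	/* hypothetical */

static void *demo_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	char *link = get_target(dentry->d_inode);

	if (IS_ERR(link))
		return link;		/* VFS will not call ->put_link() */
	nd_set_link(nd, link);
	return link;			/* becomes the ->put_link() cookie */
}

static void demo_put_link(struct dentry *dentry, struct nameidata *nd,
			  void *cookie)
{
	kfree(cookie);			/* target string no longer needed */
}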
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || 555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
558 if (ret) {
559 mlog_errno(ret);
560 return ret;
561 }
562 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
563 } 559 }
564 560
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
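
The statfs additions here (and in the qnx4 hunk further down) fill f_fsid the same way: widen the backing device's dev_t with huge_encode_dev() and split the 64-bit result across the two 32-bit halves of the fsid. The pattern in isolation, as a sketch:

#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/statfs.h>

/* Derive a stable f_fsid from the backing block device's number. */
static void fill_fsid(struct super_block *sb, struct kstatfs *buf)
{
	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

	buf->f_fsid.val[0] = (u32)id;
	buf->f_fsid.val[1] = (u32)(id >> 32);
}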
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..bdfbf03615a4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
@@ -1032,7 +1033,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
1032 if (!IS_ERR(tmp)) { 1033 if (!IS_ERR(tmp)) {
1033 fd = get_unused_fd_flags(flags); 1034 fd = get_unused_fd_flags(flags);
1034 if (fd >= 0) { 1035 if (fd >= 0) {
1035 struct file *f = do_filp_open(dfd, tmp, flags, mode); 1036 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
1036 if (IS_ERR(f)) { 1037 if (IS_ERR(f)) {
1037 put_unused_fd(fd); 1038 put_unused_fd(fd);
1038 fd = PTR_ERR(f); 1039 fd = PTR_ERR(f);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 38e337d51ced..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
diff --git a/fs/pipe.c b/fs/pipe.c
index 4af7aa521813..13414ec45b8d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -37,6 +37,42 @@
37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
38 */ 38 */
39 39
40static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
41{
42 if (pipe->inode)
43 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
44}
45
46void pipe_lock(struct pipe_inode_info *pipe)
47{
48 /*
49 * pipe_lock() nests non-pipe inode locks (for writing to a file)
50 */
51 pipe_lock_nested(pipe, I_MUTEX_PARENT);
52}
53EXPORT_SYMBOL(pipe_lock);
54
55void pipe_unlock(struct pipe_inode_info *pipe)
56{
57 if (pipe->inode)
58 mutex_unlock(&pipe->inode->i_mutex);
59}
60EXPORT_SYMBOL(pipe_unlock);
61
62void pipe_double_lock(struct pipe_inode_info *pipe1,
63 struct pipe_inode_info *pipe2)
64{
65 BUG_ON(pipe1 == pipe2);
66
67 if (pipe1 < pipe2) {
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
73 }
74}
75
40/* Drop the inode semaphore and wait for a pipe event, atomically */ 76/* Drop the inode semaphore and wait for a pipe event, atomically */
41void pipe_wait(struct pipe_inode_info *pipe) 77void pipe_wait(struct pipe_inode_info *pipe)
42{ 78{
@@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe)
47 * is considered a noninteractive wait: 83 * is considered a noninteractive wait:
48 */ 84 */
49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 85 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
50 if (pipe->inode) 86 pipe_unlock(pipe);
51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 87 schedule();
53 finish_wait(&pipe->wait, &wait); 88 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 89 pipe_lock(pipe);
55 mutex_lock(&pipe->inode->i_mutex);
56} 90}
57 91
58static int 92static int
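
pipe_double_lock() above is the standard defense against ABBA deadlock when two locks of the same class must be held at once: always acquire them in a stable global order (here, by object address), and tell lockdep about the intentional nesting via subclasses. The same discipline for two plain mutexes, as a sketch:

#include <linux/bug.h>
#include <linux/mutex.h>

/* Lock two mutexes that different callers may take together: a fixed
 * address order guarantees no two tasks ever block on each other's
 * second lock, and the nested annotation keeps lockdep from flagging
 * the second acquisition of the same lock class. */
static void lock_pair(struct mutex *a, struct mutex *b)
{
	BUG_ON(a == b);

	if (a < b) {
		mutex_lock(a);
		mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(b);
		mutex_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}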
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7e4877d9dcb5..725a650bbbb8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,6 +80,7 @@
80#include <linux/delayacct.h> 80#include <linux/delayacct.h>
81#include <linux/seq_file.h> 81#include <linux/seq_file.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h>
83#include <linux/tracehook.h> 84#include <linux/tracehook.h>
84 85
85#include <asm/pgtable.h> 86#include <asm/pgtable.h>
@@ -352,6 +353,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
352 char state; 353 char state;
353 pid_t ppid = 0, pgid = -1, sid = -1; 354 pid_t ppid = 0, pgid = -1, sid = -1;
354 int num_threads = 0; 355 int num_threads = 0;
356 int permitted;
355 struct mm_struct *mm; 357 struct mm_struct *mm;
356 unsigned long long start_time; 358 unsigned long long start_time;
357 unsigned long cmin_flt = 0, cmaj_flt = 0; 359 unsigned long cmin_flt = 0, cmaj_flt = 0;
@@ -364,11 +366,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
364 366
365 state = *get_task_state(task); 367 state = *get_task_state(task);
366 vsize = eip = esp = 0; 368 vsize = eip = esp = 0;
369 permitted = ptrace_may_access(task, PTRACE_MODE_READ);
367 mm = get_task_mm(task); 370 mm = get_task_mm(task);
368 if (mm) { 371 if (mm) {
369 vsize = task_vsize(mm); 372 vsize = task_vsize(mm);
370 eip = KSTK_EIP(task); 373 if (permitted) {
371 esp = KSTK_ESP(task); 374 eip = KSTK_EIP(task);
375 esp = KSTK_ESP(task);
376 }
372 } 377 }
373 378
374 get_task_comm(tcomm, task); 379 get_task_comm(tcomm, task);
@@ -424,7 +429,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
424 unlock_task_sighand(task, &flags); 429 unlock_task_sighand(task, &flags);
425 } 430 }
426 431
427 if (!whole || num_threads < 2) 432 if (permitted && (!whole || num_threads < 2))
428 wchan = get_wchan(task); 433 wchan = get_wchan(task);
429 if (!whole) { 434 if (!whole) {
430 min_flt = task->min_flt; 435 min_flt = task->min_flt;
@@ -476,7 +481,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
476 rsslim, 481 rsslim,
477 mm ? mm->start_code : 0, 482 mm ? mm->start_code : 0,
478 mm ? mm->end_code : 0, 483 mm ? mm->end_code : 0,
479 mm ? mm->start_stack : 0, 484 (permitted && mm) ? mm->start_stack : 0,
480 esp, 485 esp,
481 eip, 486 eip,
482 /* The signal information here is obsolete. 487 /* The signal information here is obsolete.
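
The do_task_stat() changes take one upfront ptrace_may_access() decision and use it to blank the security-sensitive fields (eip, esp, wchan, start_stack) for unprivileged readers, rather than failing the whole read. The shape of that pattern, as a sketch:

#include <linux/ptrace.h>
#include <linux/sched.h>

/* Show the real value only to a reader allowed to ptrace the task;
 * everyone else gets 0 and the read still succeeds. */
static unsigned long maybe_expose(struct task_struct *task,
				  unsigned long value)
{
	return ptrace_may_access(task, PTRACE_MODE_READ) ? value : 0;
}

Degrading the value instead of returning -EACCES keeps existing /proc/<pid>/stat parsers working while still hiding kernel addresses from unprivileged processes.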
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..3326bbf9ab95 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
@@ -321,7 +322,10 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
321 wchan = get_wchan(task); 322 wchan = get_wchan(task);
322 323
323 if (lookup_symbol_name(wchan, symname) < 0) 324 if (lookup_symbol_name(wchan, symname) < 0)
324 return sprintf(buffer, "%lu", wchan); 325 if (!ptrace_may_access(task, PTRACE_MODE_READ))
326 return 0;
327 else
328 return sprintf(buffer, "%lu", wchan);
325 else 329 else
326 return sprintf(buffer, "%s", symname); 330 return sprintf(buffer, "%s", symname);
327} 331}
@@ -647,14 +651,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
647{ 651{
648 struct proc_mounts *p = file->private_data; 652 struct proc_mounts *p = file->private_data;
649 struct mnt_namespace *ns = p->ns; 653 struct mnt_namespace *ns = p->ns;
650 unsigned res = 0; 654 unsigned res = POLLIN | POLLRDNORM;
651 655
652 poll_wait(file, &ns->poll, wait); 656 poll_wait(file, &ns->poll, wait);
653 657
654 spin_lock(&vfsmount_lock); 658 spin_lock(&vfsmount_lock);
655 if (p->event != ns->event) { 659 if (p->event != ns->event) {
656 p->event = ns->event; 660 p->event = ns->event;
657 res = POLLERR; 661 res |= POLLERR | POLLPRI;
658 } 662 }
659 spin_unlock(&vfsmount_lock); 663 spin_unlock(&vfsmount_lock);
660 664
@@ -1952,7 +1956,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
1952 const struct pid_entry *p = ptr; 1956 const struct pid_entry *p = ptr;
1953 struct inode *inode; 1957 struct inode *inode;
1954 struct proc_inode *ei; 1958 struct proc_inode *ei;
1955 struct dentry *error = ERR_PTR(-EINVAL); 1959 struct dentry *error = ERR_PTR(-ENOENT);
1956 1960
1957 inode = proc_pid_make_inode(dir->i_sb, task); 1961 inode = proc_pid_make_inode(dir->i_sb, task);
1958 if (!inode) 1962 if (!inode)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..c6b0302af4c4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
35#define K(x) ((x) << (PAGE_SHIFT - 10)) 35#define K(x) ((x) << (PAGE_SHIFT - 10))
36 si_meminfo(&i); 36 si_meminfo(&i);
37 si_swapinfo(&i); 37 si_swapinfo(&i);
38 committed = atomic_long_read(&vm_committed_space); 38 committed = percpu_counter_read_positive(&vm_committed_as);
39 allowed = ((totalram_pages - hugetlb_total_pages()) 39 allowed = ((totalram_pages - hugetlb_total_pages())
40 * sysctl_overcommit_ratio / 100) + total_swap_pages; 40 * sysctl_overcommit_ratio / 100) + total_swap_pages;
41 41
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae60..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
144{ 144{
145 struct proc_dir_entry *ent; 145 struct proc_dir_entry *ent;
146 146
147 if (!driver->ops->read_proc || !driver->driver_name || 147 if (!driver->driver_name || driver->proc_entry ||
148 driver->proc_entry) 148 !driver->ops->proc_fops)
149 return; 149 return;
150 150
151 ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); 151 ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
152 if (!ent) 152 driver->ops->proc_fops, driver);
153 return;
154 ent->read_proc = driver->ops->read_proc;
155 ent->data = driver;
156
157 driver->proc_entry = ent; 153 driver->proc_entry = ent;
158} 154}
159 155
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1e15a2b176e8..b080b791d9e3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -67,8 +67,7 @@ static int proc_get_sb(struct file_system_type *fs_type,
67 sb->s_flags = flags; 67 sb->s_flags = flags;
68 err = proc_fill_super(sb); 68 err = proc_fill_super(sb);
69 if (err) { 69 if (err) {
70 up_write(&sb->s_umount); 70 deactivate_locked_super(sb);
71 deactivate_super(sb);
72 return err; 71 return err;
73 } 72 }
74 73
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f75efa22df5e..81e4eb60972e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,6 +18,9 @@
18#ifndef arch_irq_stat 18#ifndef arch_irq_stat
19#define arch_irq_stat() 0 19#define arch_irq_stat() 0
20#endif 20#endif
21#ifndef arch_idle_time
22#define arch_idle_time(cpu) 0
23#endif
21 24
22static int show_stat(struct seq_file *p, void *v) 25static int show_stat(struct seq_file *p, void *v)
23{ 26{
@@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v)
40 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
41 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 44 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
42 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); 45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
46 idle = cputime64_add(idle, arch_idle_time(i));
43 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); 47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
44 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
45 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
@@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v)
69 nice = kstat_cpu(i).cpustat.nice; 73 nice = kstat_cpu(i).cpustat.nice;
70 system = kstat_cpu(i).cpustat.system; 74 system = kstat_cpu(i).cpustat.system;
71 idle = kstat_cpu(i).cpustat.idle; 75 idle = kstat_cpu(i).cpustat.idle;
76 idle = cputime64_add(idle, arch_idle_time(i));
72 iowait = kstat_cpu(i).cpustat.iowait; 77 iowait = kstat_cpu(i).cpustat.iowait;
73 irq = kstat_cpu(i).cpustat.irq; 78 irq = kstat_cpu(i).cpustat.irq;
74 softirq = kstat_cpu(i).cpustat.softirq; 79 softirq = kstat_cpu(i).cpustat.softirq;
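
arch_idle_time() joins arch_irq_stat() in the usual arch-override pattern: the generic code supplies a no-op default under #ifndef, so an architecture that keeps its own idle accounting can replace it simply by defining the macro in a header included earlier. Schematically:

/* generic code: harmless default unless the arch provided one */
#ifndef arch_idle_time
#define arch_idle_time(cpu) 0
#endif

/* an arch header seen first could instead supply the real thing,
 * e.g. #define arch_idle_time(cpu) my_arch_idle_time(cpu), where
 * my_arch_idle_time() is a hypothetical per-arch helper. */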
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..6f61b7cc32e0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
204 struct file *file = vma->vm_file; 204 struct file *file = vma->vm_file;
205 int flags = vma->vm_flags; 205 int flags = vma->vm_flags;
206 unsigned long ino = 0; 206 unsigned long ino = 0;
207 unsigned long long pgoff = 0;
207 dev_t dev = 0; 208 dev_t dev = 0;
208 int len; 209 int len;
209 210
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 212 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
212 dev = inode->i_sb->s_dev; 213 dev = inode->i_sb->s_dev;
213 ino = inode->i_ino; 214 ino = inode->i_ino;
215 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
214 } 216 }
215 217
216 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 218 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 flags & VM_WRITE ? 'w' : '-', 222 flags & VM_WRITE ? 'w' : '-',
221 flags & VM_EXEC ? 'x' : '-', 223 flags & VM_EXEC ? 'x' : '-',
222 flags & VM_MAYSHARE ? 's' : 'p', 224 flags & VM_MAYSHARE ? 's' : 'p',
223 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 225 pgoff,
224 MAJOR(dev), MINOR(dev), ino, &len); 226 MAJOR(dev), MINOR(dev), ino, &len);
225 227
226 /* 228 /*
@@ -663,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
663 goto out_task; 665 goto out_task;
664 666
665 ret = 0; 667 ret = 0;
668
669 if (!count)
670 goto out_task;
671
666 mm = get_task_mm(task); 672 mm = get_task_mm(task);
667 if (!mm) 673 if (!mm)
668 goto out_task; 674 goto out_task;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..64a72e2e7650 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -125,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
125 struct file *file; 126 struct file *file;
126 dev_t dev = 0; 127 dev_t dev = 0;
127 int flags, len; 128 int flags, len;
129 unsigned long long pgoff = 0;
128 130
129 flags = vma->vm_flags; 131 flags = vma->vm_flags;
130 file = vma->vm_file; 132 file = vma->vm_file;
@@ -133,17 +135,18 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev; 136 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino; 137 ino = inode->i_ino;
138 pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
136 } 139 }
137 140
138 seq_printf(m, 141 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 142 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 143 vma->vm_start,
141 vma->vm_end, 144 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 145 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 146 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 147 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 148 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 149 pgoff,
147 MAJOR(dev), MINOR(dev), ino, &len); 150 MAJOR(dev), MINOR(dev), ino, &len);
148 151
149 if (file) { 152 if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 385a0831cc99..68d4f6dc0578 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,12 +1,3 @@
1#
2# Makefile for the Linux filesystems.
3#
4# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
5# Rewritten to use lists instead of if-statements.
6#
7
8obj-y :=
9
10obj-$(CONFIG_QUOTA) += dquot.o 1obj-$(CONFIG_QUOTA) += dquot.o
11obj-$(CONFIG_QFMT_V1) += quota_v1.o 2obj-$(CONFIG_QFMT_V1) += quota_v1.o
12obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef77..607c579e5eca 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
823 823
824 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue; 827 continue;
828 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
829 continue; 829 continue;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686c..ebb2c417912c 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
59 */ 59 */
60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
61{ 61{
62 struct pagevec lru_pvec;
63 unsigned long npages, xpages, loop, limit; 62 unsigned long npages, xpages, loop, limit;
64 struct page *pages; 63 struct page *pages;
65 unsigned order; 64 unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
102 memset(data, 0, newsize); 101 memset(data, 0, newsize);
103 102
104 /* attach all the pages to the inode's address space */ 103 /* attach all the pages to the inode's address space */
105 pagevec_init(&lru_pvec, 0);
106 for (loop = 0; loop < npages; loop++) { 104 for (loop = 0; loop < npages; loop++) {
107 struct page *page = pages + loop; 105 struct page *page = pages + loop;
108 106
109 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); 107 ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
108 GFP_KERNEL);
110 if (ret < 0) 109 if (ret < 0)
111 goto add_error; 110 goto add_error;
112 111
113 if (!pagevec_add(&lru_pvec, page))
114 __pagevec_lru_add_file(&lru_pvec);
115
116 /* prevent the page from being discarded on memory pressure */ 112 /* prevent the page from being discarded on memory pressure */
117 SetPageDirty(page); 113 SetPageDirty(page);
118 114
119 unlock_page(page); 115 unlock_page(page);
120 } 116 }
121 117
122 pagevec_lru_add_file(&lru_pvec);
123 return 0; 118 return 0;
124 119
125 fsize_exceeded: 120 fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
128 return -EFBIG; 123 return -EFBIG;
129 124
130 add_error: 125 add_error:
131 pagevec_lru_add_file(&lru_pvec); 126 while (loop < npages)
132 page_cache_release(pages + loop); 127 __free_page(pages + loop++);
133 for (loop++; loop < npages; loop++)
134 __free_page(pages + loop);
135 return ret; 128 return ret;
136} 129}
137 130
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b87..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* some random number */ 40/* some random number */
40#define RAMFS_MAGIC 0x858458f6 41#define RAMFS_MAGIC 0x858458f6
41 42
43#define RAMFS_DEFAULT_MODE 0755
44
42static const struct super_operations ramfs_ops; 45static const struct super_operations ramfs_ops;
43static const struct inode_operations ramfs_dir_inode_operations; 46static const struct inode_operations ramfs_dir_inode_operations;
44 47
@@ -158,30 +161,102 @@ static const struct inode_operations ramfs_dir_inode_operations = {
158static const struct super_operations ramfs_ops = { 161static const struct super_operations ramfs_ops = {
159 .statfs = simple_statfs, 162 .statfs = simple_statfs,
160 .drop_inode = generic_delete_inode, 163 .drop_inode = generic_delete_inode,
164 .show_options = generic_show_options,
165};
166
167struct ramfs_mount_opts {
168 umode_t mode;
169};
170
171enum {
172 Opt_mode,
173 Opt_err
161}; 174};
162 175
176static const match_table_t tokens = {
177 {Opt_mode, "mode=%o"},
178 {Opt_err, NULL}
179};
180
181struct ramfs_fs_info {
182 struct ramfs_mount_opts mount_opts;
183};
184
185static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
186{
187 substring_t args[MAX_OPT_ARGS];
188 int option;
189 int token;
190 char *p;
191
192 opts->mode = RAMFS_DEFAULT_MODE;
193
194 while ((p = strsep(&data, ",")) != NULL) {
195 if (!*p)
196 continue;
197
198 token = match_token(p, tokens, args);
199 switch (token) {
200 case Opt_mode:
201 if (match_octal(&args[0], &option))
202 return -EINVAL;
203 opts->mode = option & S_IALLUGO;
204 break;
205 default:
206 printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
207 return -EINVAL;
208 }
209 }
210
211 return 0;
212}
213
163static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 214static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
164{ 215{
165 struct inode * inode; 216 struct ramfs_fs_info *fsi;
166 struct dentry * root; 217 struct inode *inode = NULL;
167 218 struct dentry *root;
168 sb->s_maxbytes = MAX_LFS_FILESIZE; 219 int err;
169 sb->s_blocksize = PAGE_CACHE_SIZE; 220
170 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 221 save_mount_options(sb, data);
171 sb->s_magic = RAMFS_MAGIC; 222
172 sb->s_op = &ramfs_ops; 223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
173 sb->s_time_gran = 1; 224 sb->s_fs_info = fsi;
174 inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0); 225 if (!fsi) {
175 if (!inode) 226 err = -ENOMEM;
176 return -ENOMEM; 227 goto fail;
228 }
229
230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err)
232 goto fail;
233
234 sb->s_maxbytes = MAX_LFS_FILESIZE;
235 sb->s_blocksize = PAGE_CACHE_SIZE;
236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
237 sb->s_magic = RAMFS_MAGIC;
238 sb->s_op = &ramfs_ops;
239 sb->s_time_gran = 1;
240
241 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
242 if (!inode) {
243 err = -ENOMEM;
244 goto fail;
245 }
177 246
178 root = d_alloc_root(inode); 247 root = d_alloc_root(inode);
248 sb->s_root = root;
179 if (!root) { 249 if (!root) {
180 iput(inode); 250 err = -ENOMEM;
181 return -ENOMEM; 251 goto fail;
182 } 252 }
183 sb->s_root = root; 253
184 return 0; 254 return 0;
255fail:
256 kfree(fsi);
257 sb->s_fs_info = NULL;
258 iput(inode);
259 return err;
185} 260}
186 261
187int ramfs_get_sb(struct file_system_type *fs_type, 262int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +272,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
197 mnt); 272 mnt);
198} 273}
199 274
275static void ramfs_kill_sb(struct super_block *sb)
276{
277 kfree(sb->s_fs_info);
278 kill_litter_super(sb);
279}
280
200static struct file_system_type ramfs_fs_type = { 281static struct file_system_type ramfs_fs_type = {
201 .name = "ramfs", 282 .name = "ramfs",
202 .get_sb = ramfs_get_sb, 283 .get_sb = ramfs_get_sb,
203 .kill_sb = kill_litter_super, 284 .kill_sb = ramfs_kill_sb,
204}; 285};
205static struct file_system_type rootfs_fs_type = { 286static struct file_system_type rootfs_fs_type = {
206 .name = "rootfs", 287 .name = "rootfs",
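
The ramfs option handling above is the stock mount-option parser: declare a match_table_t mapping tokens to printf-style patterns, split the option string on commas with strsep(), and dispatch on match_token(). A compact sketch of the same loop for a single hypothetical size=%d option:

#include <linux/parser.h>
#include <linux/string.h>

enum { Opt_size, Opt_err };

static const match_table_t demo_tokens = {
	{Opt_size, "size=%d"},
	{Opt_err, NULL}
};

static int demo_parse(char *data, int *size)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	while ((p = strsep(&data, ",")) != NULL) {
		if (!*p)
			continue;	/* tolerate empty ",," segments */

		switch (match_token(p, demo_tokens, args)) {
		case Opt_size:
			if (match_int(&args[0], &option))
				return -EINVAL;
			*size = option;
			break;
		default:
			return -EINVAL;	/* unknown option */
		}
	}
	return 0;
}

Note also the ramfs_kill_sb() pairing: once s_fs_info carries an allocation, the file_system_type needs a kill_sb that frees it, since kill_litter_super() alone would leak it.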
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
731 return ret; 731 return ret;
732} 732}
733 733
734static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
735{
736#define HALF_LONG_BITS (BITS_PER_LONG / 2)
737 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
738}
739
740SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
741 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
742{
743 loff_t pos = pos_from_hilo(pos_h, pos_l);
744 struct file *file;
745 ssize_t ret = -EBADF;
746 int fput_needed;
747
748 if (pos < 0)
749 return -EINVAL;
750
751 file = fget_light(fd, &fput_needed);
752 if (file) {
753 ret = -ESPIPE;
754 if (file->f_mode & FMODE_PREAD)
755 ret = vfs_readv(file, vec, vlen, &pos);
756 fput_light(file, fput_needed);
757 }
758
759 if (ret > 0)
760 add_rchar(current, ret);
761 inc_syscr(current);
762 return ret;
763}
764
765SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
766 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
767{
768 loff_t pos = pos_from_hilo(pos_h, pos_l);
769 struct file *file;
770 ssize_t ret = -EBADF;
771 int fput_needed;
772
773 if (pos < 0)
774 return -EINVAL;
775
776 file = fget_light(fd, &fput_needed);
777 if (file) {
778 ret = -ESPIPE;
779 if (file->f_mode & FMODE_PWRITE)
780 ret = vfs_writev(file, vec, vlen, &pos);
781 fput_light(file, fput_needed);
782 }
783
784 if (ret > 0)
785 add_wchar(current, ret);
786 inc_syscw(current);
787 return ret;
788}
789
734static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 790static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
735 size_t count, loff_t max) 791 size_t count, loff_t max)
736{ 792{
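
pos_from_hilo() above merits a note. On a 64-bit kernel HALF_LONG_BITS is 32 and the full offset arrives in pos_l, so high must be discarded; but composing it as one shift by BITS_PER_LONG would shift a 64-bit loff_t by 64, which C leaves undefined. Two half-width shifts are always in range and give the right answer on both 32-bit and 64-bit. A userspace illustration of the same helper, with int64_t standing in for loff_t:

#include <stdint.h>

#define HALF_LONG_BITS (sizeof(long) * 8 / 2)

static int64_t pos_from_hilo(unsigned long high, unsigned long low)
{
	/* 32-bit: two 16-bit shifts act as one 32-bit shift, giving
	 * (high << 32) | low.  64-bit: two 32-bit shifts push high out
	 * entirely, leaving low, which already holds the full offset. */
	return (((int64_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}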
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
1config REISERFS_FS 1config REISERFS_FS
2 tristate "Reiserfs support" 2 tristate "Reiserfs support"
3 select CRC32
3 help 4 help
4 Stores not just filenames but the files themselves in a balanced 5 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling. 6 tree. Uses journalling.
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 67a80d7e59e2..45ee3d357c70 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -41,6 +41,18 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
41 41
42#define store_ih(where,what) copy_item_head (where, what) 42#define store_ih(where,what) copy_item_head (where, what)
43 43
44static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh)
46{
47 int ret = 0;
48#ifdef CONFIG_REISERFS_FS_XATTR
49 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
50 ret = (dir == dir->d_parent && privroot->d_inode &&
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
52#endif
53 return ret;
54}
55
44int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, 56int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
45 filldir_t filldir, loff_t *pos) 57 filldir_t filldir, loff_t *pos)
46{ 58{
@@ -138,18 +150,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
138 } 150 }
139 151
140 /* Ignore the .reiserfs_priv entry */ 152 /* Ignore the .reiserfs_priv entry */
141 if (reiserfs_xattrs(inode->i_sb) && 153 if (is_privroot_deh(dentry, deh))
142 !old_format_only(inode->i_sb) &&
143 dentry == inode->i_sb->s_root &&
144 REISERFS_SB(inode->i_sb)->priv_root &&
145 REISERFS_SB(inode->i_sb)->priv_root->d_inode
146 && deh_objectid(deh) ==
147 le32_to_cpu(INODE_PKEY
148 (REISERFS_SB(inode->i_sb)->
149 priv_root->d_inode)->
150 k_objectid)) {
151 continue; 154 continue;
152 }
153 155
154 d_off = deh_offset(deh); 156 d_off = deh_offset(deh);
155 *pos = d_off; 157 *pos = d_off;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index efd4d720718e..271579128634 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -338,21 +338,8 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
338 &path_to_entry, &de); 338 &path_to_entry, &de);
339 pathrelse(&path_to_entry); 339 pathrelse(&path_to_entry);
340 if (retval == NAME_FOUND) { 340 if (retval == NAME_FOUND) {
341 /* Hide the .reiserfs_priv directory */ 341 inode = reiserfs_iget(dir->i_sb,
342 if (reiserfs_xattrs(dir->i_sb) && 342 (struct cpu_key *)&(de.de_dir_id));
343 !old_format_only(dir->i_sb) &&
344 REISERFS_SB(dir->i_sb)->priv_root &&
345 REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
346 de.de_objectid ==
347 le32_to_cpu(INODE_PKEY
348 (REISERFS_SB(dir->i_sb)->priv_root->d_inode)->
349 k_objectid)) {
350 reiserfs_write_unlock(dir->i_sb);
351 return ERR_PTR(-EACCES);
352 }
353
354 inode =
355 reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
356 if (!inode || IS_ERR(inode)) { 343 if (!inode || IS_ERR(inode)) {
357 reiserfs_write_unlock(dir->i_sb); 344 reiserfs_write_unlock(dir->i_sb);
358 return ERR_PTR(-EACCES); 345 return ERR_PTR(-EACCES);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c62896..3567fb9e3fb1 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -447,13 +448,11 @@ int remove_save_link(struct inode *inode, int truncate)
447static void reiserfs_kill_sb(struct super_block *s) 448static void reiserfs_kill_sb(struct super_block *s)
448{ 449{
449 if (REISERFS_SB(s)) { 450 if (REISERFS_SB(s)) {
450#ifdef CONFIG_REISERFS_FS_XATTR
451 if (REISERFS_SB(s)->xattr_root) { 451 if (REISERFS_SB(s)->xattr_root) {
452 d_invalidate(REISERFS_SB(s)->xattr_root); 452 d_invalidate(REISERFS_SB(s)->xattr_root);
453 dput(REISERFS_SB(s)->xattr_root); 453 dput(REISERFS_SB(s)->xattr_root);
454 REISERFS_SB(s)->xattr_root = NULL; 454 REISERFS_SB(s)->xattr_root = NULL;
455 } 455 }
456#endif
457 if (REISERFS_SB(s)->priv_root) { 456 if (REISERFS_SB(s)->priv_root) {
458 d_invalidate(REISERFS_SB(s)->priv_root); 457 d_invalidate(REISERFS_SB(s)->priv_root);
459 dput(REISERFS_SB(s)->priv_root); 458 dput(REISERFS_SB(s)->priv_root);
@@ -1315,8 +1314,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1315 } 1314 }
1316 1315
1317out_ok: 1316out_ok:
1318 kfree(s->s_options); 1317 replace_mount_options(s, new_opts);
1319 s->s_options = new_opts;
1320 return 0; 1318 return 0;
1321 1319
1322out_err: 1320out_err:
@@ -1841,7 +1839,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1841 goto error; 1839 goto error;
1842 } 1840 }
1843 1841
1844 if ((errval = reiserfs_xattr_init(s, s->s_flags))) { 1842 if ((errval = reiserfs_lookup_privroot(s)) ||
1843 (errval = reiserfs_xattr_init(s, s->s_flags))) {
1845 dput(s->s_root); 1844 dput(s->s_root);
1846 s->s_root = NULL; 1845 s->s_root = NULL;
1847 goto error; 1846 goto error;
@@ -1854,7 +1853,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1854 reiserfs_info(s, "using 3.5.x disk format\n"); 1853 reiserfs_info(s, "using 3.5.x disk format\n");
1855 } 1854 }
1856 1855
1857 if ((errval = reiserfs_xattr_init(s, s->s_flags))) { 1856 if ((errval = reiserfs_lookup_privroot(s)) ||
1857 (errval = reiserfs_xattr_init(s, s->s_flags))) {
1858 dput(s->s_root); 1858 dput(s->s_root);
1859 s->s_root = NULL; 1859 s->s_root = NULL;
1860 goto error; 1860 goto error;
@@ -1904,6 +1904,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1904 buf->f_bsize = dentry->d_sb->s_blocksize; 1904 buf->f_bsize = dentry->d_sb->s_blocksize;
1905 /* changed to accommodate gcc folks. */ 1905 /* changed to accommodate gcc folks. */
1906 buf->f_type = REISERFS_SUPER_MAGIC; 1906 buf->f_type = REISERFS_SUPER_MAGIC;
1907 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1908 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1909 sizeof(rs->s_uuid)/2);
1910
1907 return 0; 1911 return 0;
1908} 1912}
1909 1913
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f83f52bae390..8e7deb0e6964 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -113,41 +113,30 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
113 113
114#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) 114#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
115 115
116/* Returns and possibly creates the xattr dir. */ 116static struct dentry *open_xa_root(struct super_block *sb, int flags)
117static struct dentry *lookup_or_create_dir(struct dentry *parent,
118 const char *name, int flags)
119{ 117{
120 struct dentry *dentry; 118 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
121 BUG_ON(!parent); 119 struct dentry *xaroot;
122 120 if (!privroot->d_inode)
123 dentry = lookup_one_len(name, parent, strlen(name)); 121 return ERR_PTR(-ENODATA);
124 if (IS_ERR(dentry))
125 return dentry;
126 else if (!dentry->d_inode) {
127 int err = -ENODATA;
128 122
129 if (xattr_may_create(flags)) { 123 mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
130 mutex_lock_nested(&parent->d_inode->i_mutex,
131 I_MUTEX_XATTR);
132 err = xattr_mkdir(parent->d_inode, dentry, 0700);
133 mutex_unlock(&parent->d_inode->i_mutex);
134 }
135 124
125 xaroot = dget(REISERFS_SB(sb)->xattr_root);
126 if (!xaroot)
127 xaroot = ERR_PTR(-ENODATA);
128 else if (!xaroot->d_inode) {
129 int err = -ENODATA;
130 if (xattr_may_create(flags))
131 err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
136 if (err) { 132 if (err) {
137 dput(dentry); 133 dput(xaroot);
138 dentry = ERR_PTR(err); 134 xaroot = ERR_PTR(err);
139 } 135 }
140 } 136 }
141 137
142 return dentry; 138 mutex_unlock(&privroot->d_inode->i_mutex);
143} 139 return xaroot;
144
145static struct dentry *open_xa_root(struct super_block *sb, int flags)
146{
147 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
148 if (!privroot)
149 return ERR_PTR(-ENODATA);
150 return lookup_or_create_dir(privroot, XAROOT_NAME, flags);
151} 140}
152 141
153static struct dentry *open_xa_dir(const struct inode *inode, int flags) 142static struct dentry *open_xa_dir(const struct inode *inode, int flags)
@@ -163,10 +152,22 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
163 le32_to_cpu(INODE_PKEY(inode)->k_objectid), 152 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
164 inode->i_generation); 153 inode->i_generation);
165 154
166 xadir = lookup_or_create_dir(xaroot, namebuf, flags); 155 mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR);
156
157 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
158 if (!IS_ERR(xadir) && !xadir->d_inode) {
159 int err = -ENODATA;
160 if (xattr_may_create(flags))
161 err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
162 if (err) {
163 dput(xadir);
164 xadir = ERR_PTR(err);
165 }
166 }
167
168 mutex_unlock(&xaroot->d_inode->i_mutex);
167 dput(xaroot); 169 dput(xaroot);
168 return xadir; 170 return xadir;
169
170} 171}
171 172
172/* The following are side effects of other operations that aren't explicitly 173/* The following are side effects of other operations that aren't explicitly
@@ -184,6 +185,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
184{ 185{
185 struct reiserfs_dentry_buf *dbuf = buf; 186 struct reiserfs_dentry_buf *dbuf = buf;
186 struct dentry *dentry; 187 struct dentry *dentry;
188 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
187 189
188 if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) 190 if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
189 return -ENOSPC; 191 return -ENOSPC;
@@ -349,6 +351,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
349 if (IS_ERR(xadir)) 351 if (IS_ERR(xadir))
350 return ERR_CAST(xadir); 352 return ERR_CAST(xadir);
351 353
354 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
352 xafile = lookup_one_len(name, xadir, strlen(name)); 355 xafile = lookup_one_len(name, xadir, strlen(name));
353 if (IS_ERR(xafile)) { 356 if (IS_ERR(xafile)) {
354 err = PTR_ERR(xafile); 357 err = PTR_ERR(xafile);
@@ -360,18 +363,15 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
360 363
361 if (!xafile->d_inode) { 364 if (!xafile->d_inode) {
362 err = -ENODATA; 365 err = -ENODATA;
363 if (xattr_may_create(flags)) { 366 if (xattr_may_create(flags))
364 mutex_lock_nested(&xadir->d_inode->i_mutex,
365 I_MUTEX_XATTR);
366 err = xattr_create(xadir->d_inode, xafile, 367 err = xattr_create(xadir->d_inode, xafile,
367 0700|S_IFREG); 368 0700|S_IFREG);
368 mutex_unlock(&xadir->d_inode->i_mutex);
369 }
370 } 369 }
371 370
372 if (err) 371 if (err)
373 dput(xafile); 372 dput(xafile);
374out: 373out:
374 mutex_unlock(&xadir->d_inode->i_mutex);
375 dput(xadir); 375 dput(xadir);
376 if (err) 376 if (err)
377 return ERR_PTR(err); 377 return ERR_PTR(err);
@@ -435,6 +435,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
435 if (IS_ERR(xadir)) 435 if (IS_ERR(xadir))
436 return PTR_ERR(xadir); 436 return PTR_ERR(xadir);
437 437
438 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
438 dentry = lookup_one_len(name, xadir, strlen(name)); 439 dentry = lookup_one_len(name, xadir, strlen(name));
439 if (IS_ERR(dentry)) { 440 if (IS_ERR(dentry)) {
440 err = PTR_ERR(dentry); 441 err = PTR_ERR(dentry);
@@ -442,14 +443,13 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
442 } 443 }
443 444
444 if (dentry->d_inode) { 445 if (dentry->d_inode) {
445 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
446 err = xattr_unlink(xadir->d_inode, dentry); 446 err = xattr_unlink(xadir->d_inode, dentry);
447 mutex_unlock(&xadir->d_inode->i_mutex);
448 update_ctime(inode); 447 update_ctime(inode);
449 } 448 }
450 449
451 dput(dentry); 450 dput(dentry);
452out_dput: 451out_dput:
452 mutex_unlock(&xadir->d_inode->i_mutex);
453 dput(xadir); 453 dput(xadir);
454 return err; 454 return err;
455} 455}
@@ -687,20 +687,6 @@ out:
687 return err; 687 return err;
688} 688}
689 689
690/* Actual operations that are exported to VFS-land */
691struct xattr_handler *reiserfs_xattr_handlers[] = {
692 &reiserfs_xattr_user_handler,
693 &reiserfs_xattr_trusted_handler,
694#ifdef CONFIG_REISERFS_FS_SECURITY
695 &reiserfs_xattr_security_handler,
696#endif
697#ifdef CONFIG_REISERFS_FS_POSIX_ACL
698 &reiserfs_posix_acl_access_handler,
699 &reiserfs_posix_acl_default_handler,
700#endif
701 NULL
702};
703
704/* 690/*
705 * In order to implement different sets of xattr operations for each xattr 691 * In order to implement different sets of xattr operations for each xattr
706 * prefix with the generic xattr API, a filesystem should create a 692 * prefix with the generic xattr API, a filesystem should create a
@@ -843,7 +829,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
843 if (!dentry->d_inode) 829 if (!dentry->d_inode)
844 return -EINVAL; 830 return -EINVAL;
845 831
846 if (!reiserfs_xattrs(dentry->d_sb) || 832 if (!dentry->d_sb->s_xattr ||
847 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 833 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
848 return -EOPNOTSUPP; 834 return -EOPNOTSUPP;
849 835
@@ -885,42 +871,50 @@ static int reiserfs_check_acl(struct inode *inode, int mask)
885 return error; 871 return error;
886} 872}
887 873
888int reiserfs_permission(struct inode *inode, int mask)
889{
890 /*
891 * We don't do permission checks on the internal objects.
892 * Permissions are determined by the "owning" object.
893 */
894 if (IS_PRIVATE(inode))
895 return 0;
896 /*
897 * Stat data v1 doesn't support ACLs.
898 */
899 if (get_inode_sd_version(inode) == STAT_DATA_V1)
900 return generic_permission(inode, mask, NULL);
901 else
902 return generic_permission(inode, mask, reiserfs_check_acl);
903}
904
905static int create_privroot(struct dentry *dentry) 874static int create_privroot(struct dentry *dentry)
906{ 875{
907 int err; 876 int err;
908 struct inode *inode = dentry->d_parent->d_inode; 877 struct inode *inode = dentry->d_parent->d_inode;
909 mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); 878 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
879
910 err = xattr_mkdir(inode, dentry, 0700); 880 err = xattr_mkdir(inode, dentry, 0700);
911 mutex_unlock(&inode->i_mutex); 881 if (err || !dentry->d_inode) {
912 if (err) { 882 reiserfs_warning(dentry->d_sb, "jdm-20006",
913 dput(dentry); 883 "xattrs/ACLs enabled and couldn't "
914 dentry = NULL; 884 "find/create .reiserfs_priv. "
885 "Failing mount.");
886 return -EOPNOTSUPP;
915 } 887 }
916 888
917 if (dentry && dentry->d_inode) 889 dentry->d_inode->i_flags |= S_PRIVATE;
918 reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " 890 reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
919 "storage.\n", PRIVROOT_NAME); 891 "storage.\n", PRIVROOT_NAME);
920 892
921 return err; 893 return 0;
922} 894}
923 895
896#else
897int __init reiserfs_xattr_register_handlers(void) { return 0; }
898void reiserfs_xattr_unregister_handlers(void) {}
899static int create_privroot(struct dentry *dentry) { return 0; }
900#endif
901
902/* Actual operations that are exported to VFS-land */
903struct xattr_handler *reiserfs_xattr_handlers[] = {
904#ifdef CONFIG_REISERFS_FS_XATTR
905 &reiserfs_xattr_user_handler,
906 &reiserfs_xattr_trusted_handler,
907#endif
908#ifdef CONFIG_REISERFS_FS_SECURITY
909 &reiserfs_xattr_security_handler,
910#endif
911#ifdef CONFIG_REISERFS_FS_POSIX_ACL
912 &reiserfs_posix_acl_access_handler,
913 &reiserfs_posix_acl_default_handler,
914#endif
915 NULL
916};
917
924static int xattr_mount_check(struct super_block *s) 918static int xattr_mount_check(struct super_block *s)
925{ 919{
926 /* We need generation numbers to ensure that the oid mapping is correct 920 /* We need generation numbers to ensure that the oid mapping is correct
@@ -940,21 +934,33 @@ static int xattr_mount_check(struct super_block *s)
940 return 0; 934 return 0;
941} 935}
942 936
943#else 937int reiserfs_permission(struct inode *inode, int mask)
944int __init reiserfs_xattr_register_handlers(void) { return 0; } 938{
945void reiserfs_xattr_unregister_handlers(void) {} 939 /*
940 * We don't do permission checks on the internal objects.
941 * Permissions are determined by the "owning" object.
942 */
943 if (IS_PRIVATE(inode))
944 return 0;
945
946#ifdef CONFIG_REISERFS_FS_XATTR
947 /*
948 * Stat data v1 doesn't support ACLs.
949 */
950 if (get_inode_sd_version(inode) != STAT_DATA_V1)
951 return generic_permission(inode, mask, reiserfs_check_acl);
946#endif 952#endif
953 return generic_permission(inode, mask, NULL);
954}
947 955
948/* This will catch lookups from the fs root to .reiserfs_priv */ 956/* This will catch lookups from the fs root to .reiserfs_priv */
949static int 957static int
950xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) 958xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
951{ 959{
952 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; 960 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
953 if (name->len == priv_root->d_name.len && 961 if (container_of(q1, struct dentry, d_name) == priv_root)
954 name->hash == priv_root->d_name.hash &&
955 !memcmp(name->name, priv_root->d_name.name, name->len)) {
956 return -ENOENT; 962 return -ENOENT;
957 } else if (q1->len == name->len && 963 if (q1->len == name->len &&
958 !memcmp(q1->name, name->name, name->len)) 964 !memcmp(q1->name, name->name, name->len))
959 return 0; 965 return 0;
960 return 1; 966 return 1;
@@ -964,73 +970,71 @@ static const struct dentry_operations xattr_lookup_poison_ops = {
964 .d_compare = xattr_lookup_poison, 970 .d_compare = xattr_lookup_poison,
965}; 971};
966 972
973int reiserfs_lookup_privroot(struct super_block *s)
974{
975 struct dentry *dentry;
976 int err = 0;
977
978 /* If we don't have the privroot located yet - go find it */
979 mutex_lock(&s->s_root->d_inode->i_mutex);
980 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
981 strlen(PRIVROOT_NAME));
982 if (!IS_ERR(dentry)) {
983 REISERFS_SB(s)->priv_root = dentry;
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else
988 err = PTR_ERR(dentry);
989 mutex_unlock(&s->s_root->d_inode->i_mutex);
990
991 return err;
992}
993
967/* We need to take a copy of the mount flags since things like 994/* We need to take a copy of the mount flags since things like
968 * MS_RDONLY don't get set until *after* we're called. 995 * MS_RDONLY don't get set until *after* we're called.
969 * mount_flags != mount_options */ 996 * mount_flags != mount_options */
970int reiserfs_xattr_init(struct super_block *s, int mount_flags) 997int reiserfs_xattr_init(struct super_block *s, int mount_flags)
971{ 998{
972 int err = 0; 999 int err = 0;
1000 struct dentry *privroot = REISERFS_SB(s)->priv_root;
973 1001
974#ifdef CONFIG_REISERFS_FS_XATTR
975 err = xattr_mount_check(s); 1002 err = xattr_mount_check(s);
976 if (err) 1003 if (err)
977 goto error; 1004 goto error;
978#endif
979 1005
980 /* If we don't have the privroot located yet - go find it */ 1006 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
981 if (!REISERFS_SB(s)->priv_root) { 1007 mutex_lock(&s->s_root->d_inode->i_mutex);
982 struct dentry *dentry; 1008 err = create_privroot(REISERFS_SB(s)->priv_root);
983 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 1009 mutex_unlock(&s->s_root->d_inode->i_mutex);
984 strlen(PRIVROOT_NAME));
985 if (!IS_ERR(dentry)) {
986#ifdef CONFIG_REISERFS_FS_XATTR
987 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode)
988 err = create_privroot(dentry);
989#endif
990 if (!dentry->d_inode) {
991 dput(dentry);
992 dentry = NULL;
993 }
994 } else
995 err = PTR_ERR(dentry);
996
997 if (!err && dentry) {
998 s->s_root->d_op = &xattr_lookup_poison_ops;
999 dentry->d_inode->i_flags |= S_PRIVATE;
1000 REISERFS_SB(s)->priv_root = dentry;
1001#ifdef CONFIG_REISERFS_FS_XATTR
1002 /* xattrs are unavailable */
1003 } else if (!(mount_flags & MS_RDONLY)) {
1004 /* If we're read-only it just means that the dir
1005 * hasn't been created. Not an error -- just no
1006 * xattrs on the fs. We'll check again if we
1007 * go read-write */
1008 reiserfs_warning(s, "jdm-20006",
1009 "xattrs/ACLs enabled and couldn't "
1010 "find/create .reiserfs_priv. "
1011 "Failing mount.");
1012 err = -EOPNOTSUPP;
1013#endif
1014 }
1015 } 1010 }
1016 1011
1017#ifdef CONFIG_REISERFS_FS_XATTR 1012 if (privroot->d_inode) {
1018 if (!err)
1019 s->s_xattr = reiserfs_xattr_handlers; 1013 s->s_xattr = reiserfs_xattr_handlers;
1014 mutex_lock(&privroot->d_inode->i_mutex);
1015 if (!REISERFS_SB(s)->xattr_root) {
1016 struct dentry *dentry;
1017 dentry = lookup_one_len(XAROOT_NAME, privroot,
1018 strlen(XAROOT_NAME));
1019 if (!IS_ERR(dentry))
1020 REISERFS_SB(s)->xattr_root = dentry;
1021 else
1022 err = PTR_ERR(dentry);
1023 }
1024 mutex_unlock(&privroot->d_inode->i_mutex);
1025 }
1020 1026
1021error: 1027error:
1022 if (err) { 1028 if (err) {
1023 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); 1029 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
1024 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); 1030 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
1025 } 1031 }
1026#endif
1027 1032
1028 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ 1033 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
1029 s->s_flags = s->s_flags & ~MS_POSIXACL;
1030#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1031 if (reiserfs_posixacl(s)) 1034 if (reiserfs_posixacl(s))
1032 s->s_flags |= MS_POSIXACL; 1035 s->s_flags |= MS_POSIXACL;
1033#endif 1036 else
1037 s->s_flags &= ~MS_POSIXACL;
1034 1038
1035 return err; 1039 return err;
1036} 1040}
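
The rework above repeatedly applies one pattern: hold the parent directory's i_mutex (nested with I_MUTEX_XATTR) across lookup_one_len() and any subsequent xattr_mkdir()/xattr_create(), so the lookup and the create are atomic with respect to other xattr operations on the same tree. A condensed, kernel-context sketch of that pattern (lookup_or_create_locked is a hypothetical name; the real call sites are open_xa_root, open_xa_dir and xattr_lookup in the hunks above):

/* sketch: look up "name" under dir, creating it if absent and the flags
 * allow; returns a referenced dentry or an ERR_PTR */
static struct dentry *lookup_or_create_locked(struct dentry *dir,
					      const char *name, int flags)
{
	struct dentry *dentry;
	int err;

	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
	dentry = lookup_one_len(name, dir, strlen(name));
	if (!IS_ERR(dentry) && !dentry->d_inode) {
		err = -ENODATA;
		if (xattr_may_create(flags))
			err = xattr_mkdir(dir->d_inode, dentry, 0700);
		if (err) {
			dput(dentry);
			dentry = ERR_PTR(err);
		}
	}
	mutex_unlock(&dir->d_inode->i_mutex);
	return dentry;
}
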
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d1..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
428 } else { 428 } else {
429 apply_umask: 429 apply_umask:
430 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
431 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
432 } 432 }
433 433
434 return err; 434 return err;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 4d3c20e787c3..a92c8792c0f6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -55,8 +55,16 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
55 struct reiserfs_security_handle *sec) 55 struct reiserfs_security_handle *sec)
56{ 56{
57 int blocks = 0; 57 int blocks = 0;
58 int error = security_inode_init_security(inode, dir, &sec->name, 58 int error;
59 &sec->value, &sec->length); 59
60 sec->name = NULL;
61
62 /* Don't add selinux attributes on xattrs - they'll never get used */
63 if (IS_PRIVATE(dir))
64 return 0;
65
66 error = security_inode_init_security(inode, dir, &sec->name,
67 &sec->value, &sec->length);
60 if (error) { 68 if (error) {
61 if (error == -EOPNOTSUPP) 69 if (error == -EOPNOTSUPP)
62 error = 0; 70 error = 0;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..95217b830118
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
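
ROMFS_I() works because vfs_inode is embedded inside romfs_inode_info (now deliberately placed first, making the pointer subtraction a no-op). A tiny, runnable userspace demonstration of the container_of pattern it relies on, with a local stand-in for the kernel macro:

#include <stddef.h>
#include <stdio.h>

/* userspace stand-in for the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode { int dummy; };            /* stand-in VFS inode */

struct romfs_inode_info {
	struct inode vfs_inode;         /* embedded first, as above */
	unsigned long i_metasize;
	unsigned long i_dataoffset;
};

static struct romfs_inode_info *ROMFS_I(struct inode *inode)
{
	return container_of(inode, struct romfs_inode_info, vfs_inode);
}

int main(void)
{
	struct romfs_inode_info info = { .i_metasize = 32, .i_dataoffset = 96 };
	struct inode *vfs = &info.vfs_inode;   /* what the VFS hands back */

	printf("metasize=%lu dataoffset=%lu\n",
	       ROMFS_I(vfs)->i_metasize, ROMFS_I(vfs)->i_dataoffset);
	return 0;
}
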
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
18 * - only supported for NOMMU at the moment (MMU can't doesn't copy private
19 * mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit a R/O mapping to be made directly through onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
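
Worth noting in romfs_get_unmapped_area() above: the bounds test is written as "offset > isize || len > isize || offset > isize - len" precisely so that offset + len is never computed, since that sum can wrap for a hostile pgoff. A small runnable demonstration of the difference:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long isize  = 0x1000;             /* file size */
	unsigned long offset = ULONG_MAX - 0x10;   /* hostile pgoff << PAGE_SHIFT */
	unsigned long len    = 0x20;

	/* naive check: offset + len wraps to a small value and passes */
	printf("naive rejects: %d\n", offset + len > isize);

	/* the patch's form: no addition, so no wrap */
	printf("safe rejects:  %d\n",
	       offset > isize || len > isize || offset > isize - len);
	return 0;
}

The naive form prints 0 (mapping accepted) while the subtraction form prints 1 (mapping rejected), which is why the three-part test is used.
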
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..b3208adf8e71
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,293 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
25 * read data from an romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[17];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time, and attempt to grab the
78 * trailing NUL whilst we're at it */
79 buf[0] = 0xff;
80
81 while (size > 0) {
82 segment = min_t(size_t, size + 1, 17);
83 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
84 if (ret < 0)
85 return ret;
86 len--;
87 if (memcmp(buf, str, len) != 0)
88 return 0;
89 buf[0] = buf[len];
90 size -= len;
91 pos += len;
92 str += len;
93 }
94
95 /* check that the trailing NUL was present */
96 if (buf[0])
97 return 0;
98
99 return 1;
100}
101#endif /* CONFIG_ROMFS_ON_MTD */
102
103#ifdef CONFIG_ROMFS_ON_BLOCK
104/*
105 * read data from an romfs image on a block device
106 */
107static int romfs_blk_read(struct super_block *sb, unsigned long pos,
108 void *buf, size_t buflen)
109{
110 struct buffer_head *bh;
111 unsigned long offset;
112 size_t segment;
113
114 /* copy the string up to blocksize bytes at a time */
115 while (buflen > 0) {
116 offset = pos & (ROMBSIZE - 1);
117 segment = min_t(size_t, buflen, ROMBSIZE - offset);
118 bh = sb_bread(sb, pos >> ROMBSBITS);
119 if (!bh)
120 return -EIO;
121 memcpy(buf, bh->b_data + offset, segment);
122 brelse(bh);
123 buf += segment;
124 buflen -= segment;
125 pos += segment;
126 }
127
128 return 0;
129}
130
131/*
132 * determine the length of a string in romfs on a block device
133 */
134static ssize_t romfs_blk_strnlen(struct super_block *sb,
135 unsigned long pos, size_t limit)
136{
137 struct buffer_head *bh;
138 unsigned long offset;
139 ssize_t n = 0;
140 size_t segment;
141 u_char *buf, *p;
142
143 /* scan the string up to blocksize bytes at a time */
144 while (limit > 0) {
145 offset = pos & (ROMBSIZE - 1);
146 segment = min_t(size_t, limit, ROMBSIZE - offset);
147 bh = sb_bread(sb, pos >> ROMBSBITS);
148 if (!bh)
149 return -EIO;
150 buf = bh->b_data + offset;
151 p = memchr(buf, 0, segment);
152 brelse(bh);
153 if (p)
154 return n + (p - buf);
155 limit -= segment;
156 pos += segment;
157 n += segment;
158 }
159
160 return n;
161}
162
163/*
164 * compare a string to one in a romfs image on a block device
165 * - return 1 if matched, 0 if differ, -ve if error
166 */
167static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
168 const char *str, size_t size)
169{
170 struct buffer_head *bh;
171 unsigned long offset;
172 size_t segment;
173 bool matched, terminated = false;
174
175 /* compare string up to a block at a time */
176 while (size > 0) {
177 offset = pos & (ROMBSIZE - 1);
178 segment = min_t(size_t, size, ROMBSIZE - offset);
179 bh = sb_bread(sb, pos >> ROMBSBITS);
180 if (!bh)
181 return -EIO;
182 matched = (memcmp(bh->b_data + offset, str, segment) == 0);
183
184 size -= segment;
185 pos += segment;
186 str += segment;
187 if (matched && size == 0 && offset + segment < ROMBSIZE) {
188 if (!bh->b_data[offset + segment])
189 terminated = true;
190 else
191 matched = false;
192 }
193 brelse(bh);
194 if (!matched)
195 return 0;
196 }
197
198 if (!terminated) {
199 /* the terminating NUL must be on the first byte of the next
200 * block */
201 BUG_ON((pos & (ROMBSIZE - 1)) != 0);
202 bh = sb_bread(sb, pos >> ROMBSBITS);
203 if (!bh)
204 return -EIO;
205 matched = !bh->b_data[0];
206 brelse(bh);
207 if (!matched)
208 return 0;
209 }
210
211 return 1;
212}
213#endif /* CONFIG_ROMFS_ON_BLOCK */
214
215/*
216 * read data from the romfs image
217 */
218int romfs_dev_read(struct super_block *sb, unsigned long pos,
219 void *buf, size_t buflen)
220{
221 size_t limit;
222
223 limit = romfs_maxsize(sb);
224 if (pos >= limit)
225 return -EIO;
226 if (buflen > limit - pos)
227 buflen = limit - pos;
228
229#ifdef CONFIG_ROMFS_ON_MTD
230 if (sb->s_mtd)
231 return romfs_mtd_read(sb, pos, buf, buflen);
232#endif
233#ifdef CONFIG_ROMFS_ON_BLOCK
234 if (sb->s_bdev)
235 return romfs_blk_read(sb, pos, buf, buflen);
236#endif
237 return -EIO;
238}
239
240/*
241 * determine the length of a string in romfs
242 */
243ssize_t romfs_dev_strnlen(struct super_block *sb,
244 unsigned long pos, size_t maxlen)
245{
246 size_t limit;
247
248 limit = romfs_maxsize(sb);
249 if (pos >= limit)
250 return -EIO;
251 if (maxlen > limit - pos)
252 maxlen = limit - pos;
253
254#ifdef CONFIG_ROMFS_ON_MTD
255 if (sb->s_mtd)
256 return romfs_mtd_strnlen(sb, pos, maxlen);
257#endif
258#ifdef CONFIG_ROMFS_ON_BLOCK
259 if (sb->s_bdev)
260 return romfs_blk_strnlen(sb, pos, maxlen);
261#endif
262 return -EIO;
263}
264
265/*
266 * compare a string to one in romfs
267 * - the string to be compared to, str, may not be NUL-terminated; instead the
268 * string is of the specified size
269 * - return 1 if matched, 0 if differ, -ve if error
270 */
271int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
272 const char *str, size_t size)
273{
274 size_t limit;
275
276 limit = romfs_maxsize(sb);
277 if (pos >= limit)
278 return -EIO;
279 if (size > ROMFS_MAXFN)
280 return -ENAMETOOLONG;
281 if (size + 1 > limit - pos)
282 return -EIO;
283
284#ifdef CONFIG_ROMFS_ON_MTD
285 if (sb->s_mtd)
286 return romfs_mtd_strcmp(sb, pos, str, size);
287#endif
288#ifdef CONFIG_ROMFS_ON_BLOCK
289 if (sb->s_bdev)
290 return romfs_blk_strcmp(sb, pos, str, size);
291#endif
292 return -EIO;
293}
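
All three comparison helpers share a tri-state return convention: 1 for match, 0 for differ, negative errno on error, so a caller must not treat the result as a boolean. A hedged kernel-context sketch of a conforming caller (match_entry is a hypothetical name; the real consumer is the lookup loop in super.c below):

/* sketch: returns 0 on success with *found set, -ve errno on I/O error */
static int match_entry(struct super_block *sb, unsigned long pos,
		       const char *name, size_t len, bool *found)
{
	int ret = romfs_dev_strcmp(sb, pos + ROMFH_SIZE, name, len);

	if (ret < 0)		/* -EIO / -ENAMETOOLONG: propagate */
		return ret;
	*found = (ret == 1);	/* 1 == matched, 0 == differ */
	return 0;
}
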
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..4ab3c03d8f95
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,654 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
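The readpage implementation above reduces to one piece of arithmetic: clamp the page's file offset against i_size, copy at most a page from the image, and zero-fill the tail. A minimal user-space sketch of that computation (illustrative names, not kernel API; assumes the whole image is already in memory):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096u

/* Copy one page worth of file data from an in-memory image; returns the
 * number of bytes actually copied (the rest of buf is zeroed), mirroring
 * the offset/fillsize logic of romfs_readpage. */
static size_t fill_page(const uint8_t *image, size_t file_size,
                        size_t data_off, size_t page_index, uint8_t *buf)
{
        size_t offset = page_index * PAGE_SZ;
        size_t fill = 0;

        if (offset < file_size) {
                size_t remain = file_size - offset;
                fill = remain > PAGE_SZ ? PAGE_SZ : remain;
                memcpy(buf, image + data_off + offset, fill);
        }
        memset(buf + fill, 0, PAGE_SZ - fill);  /* zero the tail */
        return fill;
}

int main(void)
{
        static uint8_t image[2 * PAGE_SZ];
        uint8_t buf[PAGE_SZ];
        size_t file_size = PAGE_SZ + 100;

        memset(image, 0xAB, sizeof(image));
        printf("page 0: %zu bytes\n", fill_page(image, file_size, 0, 0, buf));
        printf("page 1: %zu bytes\n", fill_page(image, file_size, 0, 1, buf));
        return 0;
}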
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
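readdir above is a walk over a singly linked list of 16-byte-aligned file headers: the low four bits of the big-endian 'next' word carry the type, the remaining bits (ROMFH_MASK) give the offset of the next header, and zero terminates the list. A hedged user-space sketch of the same walk over an in-memory image (the 16-byte header size and 4-bit type field mirror the on-disk format; treat the constants as assumptions):

#include <arpa/inet.h>  /* ntohl/htonl: romfs is big-endian on disk */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FH_SIZE 16u        /* fixed part of a file header */
#define FH_MASK (~15u)     /* offset bits of the 'next' word */

struct romfh {             /* first two on-disk words only */
        uint32_t next;     /* type in bits 0-3, next offset above */
        uint32_t spec;
};

static void walk_dir(const uint8_t *img, uint32_t off, uint32_t maxoff)
{
        while (off && off < maxoff) {
                const struct romfh *fh = (const void *)(img + off);
                uint32_t next = ntohl(fh->next);

                /* the NUL-padded name starts right after the header */
                printf("%8u type=%u %s\n", (unsigned)off,
                       (unsigned)(next & 15),
                       (const char *)(img + off + FH_SIZE));
                off = next & FH_MASK;
        }
}

int main(void)
{
        static uint8_t img[4096];
        struct romfh *a = (struct romfh *)(img + 96);
        struct romfh *b = (struct romfh *)(img + 160);

        a->next = htonl(160 | 2);          /* regular file, next at 160 */
        strcpy((char *)(img + 96 + FH_SIZE), "hello.txt");
        b->next = htonl(0 | 1);            /* directory, end of list */
        strcpy((char *)(img + 160 + FH_SIZE), "subdir");
        walk_dir(img, 96, sizeof(img));
        return 0;
}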
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
 270 * (Although, as far as I can see, it only matters on writable file
 271 * systems.)
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301 unsigned nextfh;
302 int ret;
303 umode_t mode;
304
305 /* we might have to traverse a chain of "hard link" file entries to get
306 * to the actual file */
307 for (;;) {
308 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
309 if (ret < 0)
310 goto error;
311
312 /* XXX: do romfs_checksum here too (with name) */
313
314 nextfh = be32_to_cpu(ri.next);
315 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
316 break;
317
318 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
319 }
320
321 /* determine the length of the filename */
322 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
323 if (IS_ERR_VALUE(nlen))
324 goto eio;
325
326 /* get an inode for this image position */
327 i = iget_locked(sb, pos);
328 if (!i)
329 return ERR_PTR(-ENOMEM);
330
331 if (!(i->i_state & I_NEW))
332 return i;
333
334 /* precalculate the data offset */
335 inode = ROMFS_I(i);
336 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
337 inode->i_dataoffset = pos + inode->i_metasize;
338
339 i->i_nlink = 1; /* Hard to decide.. */
340 i->i_size = be32_to_cpu(ri.size);
341 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
342 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
343
344 /* set up mode and ops */
345 mode = romfs_modemap[nextfh & ROMFH_TYPE];
346
347 switch (nextfh & ROMFH_TYPE) {
348 case ROMFH_DIR:
349 i->i_size = ROMFS_I(i)->i_metasize;
350 i->i_op = &romfs_dir_inode_operations;
351 i->i_fop = &romfs_dir_operations;
352 if (nextfh & ROMFH_EXEC)
353 mode |= S_IXUGO;
354 break;
355 case ROMFH_REG:
356 i->i_fop = &romfs_ro_fops;
357 i->i_data.a_ops = &romfs_aops;
358 if (i->i_sb->s_mtd)
359 i->i_data.backing_dev_info =
360 i->i_sb->s_mtd->backing_dev_info;
361 if (nextfh & ROMFH_EXEC)
362 mode |= S_IXUGO;
363 break;
364 case ROMFH_SYM:
365 i->i_op = &page_symlink_inode_operations;
366 i->i_data.a_ops = &romfs_aops;
367 mode |= S_IRWXUGO;
368 break;
369 default:
370 /* depending on MBZ for sock/fifos */
371 nextfh = be32_to_cpu(ri.spec);
372 init_special_inode(i, mode, MKDEV(nextfh >> 16,
373 nextfh & 0xffff));
374 break;
375 }
376
377 i->i_mode = mode;
378
379 unlock_new_inode(i);
380 return i;
381
382eio:
383 ret = -EIO;
384error:
385 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
386 return ERR_PTR(ret);
387}
388
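The i_metasize computation in romfs_iget rounds "fixed header + name + NUL" up to the 16-byte header alignment by adding ROMFH_PAD and then masking with ROMFH_MASK. The trick generalizes to any power-of-two alignment; a quick stand-alone check of the arithmetic (constants mirror the on-disk format, labelled as assumptions):

#include <assert.h>
#include <stdio.h>

#define FH_SIZE 16u            /* fixed header size */
#define FH_PAD  (FH_SIZE - 1)  /* cf. ROMFH_PAD */
#define FH_MASK (~FH_PAD)      /* cf. ROMFH_MASK */

int main(void)
{
        for (unsigned nlen = 1; nlen <= 20; nlen++) {
                /* +1 for the terminating NUL, then round up to 16 */
                unsigned metasize = (FH_SIZE + nlen + 1 + FH_PAD) & FH_MASK;

                assert(metasize >= FH_SIZE + nlen + 1);
                assert((metasize & FH_PAD) == 0);
                printf("namelen %2u -> metasize %u\n", nlen, metasize);
        }
        return 0;
}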
389/*
390 * allocate a new inode
391 */
392static struct inode *romfs_alloc_inode(struct super_block *sb)
393{
394 struct romfs_inode_info *inode;
395 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
396 return inode ? &inode->vfs_inode : NULL;
397}
398
399/*
400 * return a spent inode to the slab cache
401 */
402static void romfs_destroy_inode(struct inode *inode)
403{
404 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
405}
406
407/*
408 * get filesystem statistics
409 */
410static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
411{
412 struct super_block *sb = dentry->d_sb;
 413 u64 id = huge_encode_dev(sb->s_bdev ? sb->s_bdev->bd_dev : sb->s_dev);
414
415 buf->f_type = ROMFS_MAGIC;
416 buf->f_namelen = ROMFS_MAXFN;
417 buf->f_bsize = ROMBSIZE;
 418 buf->f_bfree = buf->f_bavail = buf->f_ffree = 0;
419 buf->f_blocks =
420 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
421 buf->f_fsid.val[0] = (u32)id;
422 buf->f_fsid.val[1] = (u32)(id >> 32);
423 return 0;
424}
425
426/*
427 * remounting must involve read-only
428 */
429static int romfs_remount(struct super_block *sb, int *flags, char *data)
430{
431 *flags |= MS_RDONLY;
432 return 0;
433}
434
435static const struct super_operations romfs_super_ops = {
436 .alloc_inode = romfs_alloc_inode,
437 .destroy_inode = romfs_destroy_inode,
438 .statfs = romfs_statfs,
439 .remount_fs = romfs_remount,
440};
441
442/*
443 * checksum check on part of a romfs filesystem
444 */
445static __u32 romfs_checksum(const void *data, int size)
446{
447 const __be32 *ptr = data;
448 __u32 sum;
449
450 sum = 0;
451 size >>= 2;
452 while (size > 0) {
453 sum += be32_to_cpu(*ptr++);
454 size--;
455 }
456 return sum;
457}
458
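The checksum is a plain sum of big-endian 32-bit words; the on-disk checksum field is chosen so that the covered region (at most the first 512 bytes, as used by romfs_fill_super below) sums to zero. A user-space sketch of the verification, assuming the image is already in memory:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t sum_words(const void *data, size_t size)
{
        const uint32_t *p = data;
        uint32_t sum = 0;

        for (size >>= 2; size > 0; size--)
                sum += ntohl(*p++);
        return sum;
}

/* Nonzero if the leading checksummed region is consistent. */
static int image_checksum_ok(const void *img, size_t img_size)
{
        size_t n = img_size < 512 ? img_size : 512;

        return sum_words(img, n) == 0;
}

int main(void)
{
        uint32_t words[4] = { htonl(0x100), htonl(0x200), 0, 0 };

        /* pick the last word so the region sums to zero */
        words[3] = htonl((uint32_t)-(0x100 + 0x200));
        printf("ok = %d\n", image_checksum_ok(words, sizeof(words)));
        return 0;
}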
459/*
460 * fill in the superblock
461 */
462static int romfs_fill_super(struct super_block *sb, void *data, int silent)
463{
464 struct romfs_super_block *rsb;
465 struct inode *root;
466 unsigned long pos, img_size;
467 const char *storage;
468 size_t len;
469 int ret;
470
471#ifdef CONFIG_BLOCK
472 if (!sb->s_mtd) {
473 sb_set_blocksize(sb, ROMBSIZE);
474 } else {
475 sb->s_blocksize = ROMBSIZE;
476 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
477 }
478#endif
479
480 sb->s_maxbytes = 0xFFFFFFFF;
481 sb->s_magic = ROMFS_MAGIC;
482 sb->s_flags |= MS_RDONLY | MS_NOATIME;
483 sb->s_op = &romfs_super_ops;
484
485 /* read the image superblock and check it */
486 rsb = kmalloc(512, GFP_KERNEL);
487 if (!rsb)
488 return -ENOMEM;
489
490 sb->s_fs_info = (void *) 512;
491 ret = romfs_dev_read(sb, 0, rsb, 512);
492 if (ret < 0)
493 goto error_rsb;
494
495 img_size = be32_to_cpu(rsb->size);
496
497 if (sb->s_mtd && img_size > sb->s_mtd->size)
498 goto error_rsb_inval;
499
500 sb->s_fs_info = (void *) img_size;
501
502 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
503 img_size < ROMFH_SIZE) {
504 if (!silent)
505 printk(KERN_WARNING "VFS:"
506 " Can't find a romfs filesystem on dev %s.\n",
507 sb->s_id);
508 goto error_rsb_inval;
509 }
510
511 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
512 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
513 sb->s_id);
514 goto error_rsb_inval;
515 }
516
517 storage = sb->s_mtd ? "MTD" : "the block layer";
518
519 len = strnlen(rsb->name, ROMFS_MAXFN);
520 if (!silent)
521 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
522 (unsigned) len, (unsigned) len, rsb->name, storage);
523
524 kfree(rsb);
525 rsb = NULL;
526
527 /* find the root directory */
528 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
529
530 root = romfs_iget(sb, pos);
 531 if (IS_ERR(root))
532 goto error;
533
534 sb->s_root = d_alloc_root(root);
535 if (!sb->s_root)
536 goto error_i;
537
538 return 0;
539
540error_i:
541 iput(root);
542error:
543 return -EINVAL;
544error_rsb_inval:
545 ret = -EINVAL;
546error_rsb:
547 return ret;
548}
549
550/*
551 * get a superblock for mounting
552 */
553static int romfs_get_sb(struct file_system_type *fs_type,
554 int flags, const char *dev_name,
555 void *data, struct vfsmount *mnt)
556{
557 int ret = -EINVAL;
558
559#ifdef CONFIG_ROMFS_ON_MTD
560 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
561 mnt);
562#endif
563#ifdef CONFIG_ROMFS_ON_BLOCK
564 if (ret == -EINVAL)
565 ret = get_sb_bdev(fs_type, flags, dev_name, data,
566 romfs_fill_super, mnt);
567#endif
568 return ret;
569}
570
571/*
572 * destroy a romfs superblock in the appropriate manner
573 */
574static void romfs_kill_sb(struct super_block *sb)
575{
576#ifdef CONFIG_ROMFS_ON_MTD
577 if (sb->s_mtd) {
578 kill_mtd_super(sb);
579 return;
580 }
581#endif
582#ifdef CONFIG_ROMFS_ON_BLOCK
583 if (sb->s_bdev) {
584 kill_block_super(sb);
585 return;
586 }
587#endif
588}
589
590static struct file_system_type romfs_fs_type = {
591 .owner = THIS_MODULE,
592 .name = "romfs",
593 .get_sb = romfs_get_sb,
594 .kill_sb = romfs_kill_sb,
595 .fs_flags = FS_REQUIRES_DEV,
596};
597
598/*
599 * inode storage initialiser
600 */
601static void romfs_i_init_once(void *_inode)
602{
603 struct romfs_inode_info *inode = _inode;
604
605 inode_init_once(&inode->vfs_inode);
606}
607
608/*
609 * romfs module initialisation
610 */
611static int __init init_romfs_fs(void)
612{
613 int ret;
614
615 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
616
617 romfs_inode_cachep =
618 kmem_cache_create("romfs_i",
619 sizeof(struct romfs_inode_info), 0,
620 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
621 romfs_i_init_once);
622
623 if (!romfs_inode_cachep) {
624 printk(KERN_ERR
625 "ROMFS error: Failed to initialise inode cache\n");
626 return -ENOMEM;
627 }
628 ret = register_filesystem(&romfs_fs_type);
629 if (ret) {
630 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
631 goto error_register;
632 }
633 return 0;
634
635error_register:
636 kmem_cache_destroy(romfs_inode_cachep);
637 return ret;
638}
639
640/*
641 * romfs module removal
642 */
643static void __exit exit_romfs_fs(void)
644{
645 unregister_filesystem(&romfs_fs_type);
646 kmem_cache_destroy(romfs_inode_cachep);
647}
648
649module_init(init_romfs_fs);
650module_exit(exit_romfs_fs);
651
652MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
653MODULE_AUTHOR("Red Hat, Inc.");
654MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..666953d59a35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
@@ -181,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
181 do_wakeup = 0; 182 do_wakeup = 0;
182 page_nr = 0; 183 page_nr = 0;
183 184
184 if (pipe->inode) 185 pipe_lock(pipe);
185 mutex_lock(&pipe->inode->i_mutex);
186 186
187 for (;;) { 187 for (;;) {
188 if (!pipe->readers) { 188 if (!pipe->readers) {
@@ -244,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
244 pipe->waiting_writers--; 244 pipe->waiting_writers--;
245 } 245 }
246 246
247 if (pipe->inode) { 247 pipe_unlock(pipe);
248 mutex_unlock(&pipe->inode->i_mutex);
249 248
250 if (do_wakeup) { 249 if (do_wakeup) {
251 smp_mb(); 250 smp_mb();
252 if (waitqueue_active(&pipe->wait)) 251 if (waitqueue_active(&pipe->wait))
253 wake_up_interruptible(&pipe->wait); 252 wake_up_interruptible(&pipe->wait);
254 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 253 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
255 }
256 } 254 }
257 255
258 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
@@ -554,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
554 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
555 * a new page in the output file page cache and fill/dirty that. 553 * a new page in the output file page cache and fill/dirty that.
556 */ 554 */
557static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
558 struct splice_desc *sd) 556 struct splice_desc *sd)
559{ 557{
560 struct file *file = sd->u.file; 558 struct file *file = sd->u.file;
561 struct address_space *mapping = file->f_mapping; 559 struct address_space *mapping = file->f_mapping;
@@ -599,108 +597,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
599out: 597out:
600 return ret; 598 return ret;
601} 599}
600EXPORT_SYMBOL(pipe_to_file);
601
602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
603{
604 smp_mb();
605 if (waitqueue_active(&pipe->wait))
606 wake_up_interruptible(&pipe->wait);
607 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
608}
602 609
603/** 610/**
604 * __splice_from_pipe - splice data from a pipe to given actor 611 * splice_from_pipe_feed - feed available data from a pipe to a file
605 * @pipe: pipe to splice from 612 * @pipe: pipe to splice from
606 * @sd: information to @actor 613 * @sd: information to @actor
607 * @actor: handler that splices the data 614 * @actor: handler that splices the data
608 * 615 *
609 * Description: 616 * Description:
610 * This function does little more than loop over the pipe and call 617 * This function loops over the pipe and calls @actor to do the
611 * @actor to do the actual moving of a single struct pipe_buffer to 618 * actual moving of a single struct pipe_buffer to the desired
 612 the desired destination. See pipe_to_file, pipe_to_sendpage, or 619 destination. It returns when there are no more buffers left in
613 * pipe_to_user. 620 * the pipe or if the requested number of bytes (@sd->total_len)
621 * have been copied. It returns a positive number (one) if the
622 * pipe needs to be filled with more data, zero if the required
623 * number of bytes have been copied and -errno on error.
614 * 624 *
625 * This, together with splice_from_pipe_{begin,end,next}, may be
626 * used to implement the functionality of __splice_from_pipe() when
627 * locking is required around copying the pipe buffers to the
628 * destination.
615 */ 629 */
616ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 630int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
617 splice_actor *actor) 631 splice_actor *actor)
618{ 632{
619 int ret, do_wakeup, err; 633 int ret;
620
621 ret = 0;
622 do_wakeup = 0;
623
624 for (;;) {
625 if (pipe->nrbufs) {
626 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
627 const struct pipe_buf_operations *ops = buf->ops;
628 634
629 sd->len = buf->len; 635 while (pipe->nrbufs) {
630 if (sd->len > sd->total_len) 636 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
631 sd->len = sd->total_len; 637 const struct pipe_buf_operations *ops = buf->ops;
632 638
633 err = actor(pipe, buf, sd); 639 sd->len = buf->len;
634 if (err <= 0) { 640 if (sd->len > sd->total_len)
635 if (!ret && err != -ENODATA) 641 sd->len = sd->total_len;
636 ret = err;
637 642
638 break; 643 ret = actor(pipe, buf, sd);
639 } 644 if (ret <= 0) {
645 if (ret == -ENODATA)
646 ret = 0;
647 return ret;
648 }
649 buf->offset += ret;
650 buf->len -= ret;
640 651
641 ret += err; 652 sd->num_spliced += ret;
642 buf->offset += err; 653 sd->len -= ret;
643 buf->len -= err; 654 sd->pos += ret;
655 sd->total_len -= ret;
644 656
645 sd->len -= err; 657 if (!buf->len) {
646 sd->pos += err; 658 buf->ops = NULL;
647 sd->total_len -= err; 659 ops->release(pipe, buf);
648 if (sd->len) 660 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
649 continue; 661 pipe->nrbufs--;
662 if (pipe->inode)
663 sd->need_wakeup = true;
664 }
650 665
651 if (!buf->len) { 666 if (!sd->total_len)
652 buf->ops = NULL; 667 return 0;
653 ops->release(pipe, buf); 668 }
654 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
655 pipe->nrbufs--;
656 if (pipe->inode)
657 do_wakeup = 1;
658 }
659 669
660 if (!sd->total_len) 670 return 1;
661 break; 671}
662 } 672EXPORT_SYMBOL(splice_from_pipe_feed);
663 673
664 if (pipe->nrbufs) 674/**
665 continue; 675 * splice_from_pipe_next - wait for some data to splice from
676 * @pipe: pipe to splice from
677 * @sd: information about the splice operation
678 *
679 * Description:
680 * This function will wait for some data and return a positive
681 * value (one) if pipe buffers are available. It will return zero
682 * or -errno if no more data needs to be spliced.
683 */
684int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
685{
686 while (!pipe->nrbufs) {
666 if (!pipe->writers) 687 if (!pipe->writers)
667 break; 688 return 0;
668 if (!pipe->waiting_writers) {
669 if (ret)
670 break;
671 }
672 689
673 if (sd->flags & SPLICE_F_NONBLOCK) { 690 if (!pipe->waiting_writers && sd->num_spliced)
674 if (!ret) 691 return 0;
675 ret = -EAGAIN;
676 break;
677 }
678 692
679 if (signal_pending(current)) { 693 if (sd->flags & SPLICE_F_NONBLOCK)
680 if (!ret) 694 return -EAGAIN;
681 ret = -ERESTARTSYS;
682 break;
683 }
684 695
685 if (do_wakeup) { 696 if (signal_pending(current))
686 smp_mb(); 697 return -ERESTARTSYS;
687 if (waitqueue_active(&pipe->wait)) 698
688 wake_up_interruptible_sync(&pipe->wait); 699 if (sd->need_wakeup) {
689 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 700 wakeup_pipe_writers(pipe);
690 do_wakeup = 0; 701 sd->need_wakeup = false;
691 } 702 }
692 703
693 pipe_wait(pipe); 704 pipe_wait(pipe);
694 } 705 }
695 706
696 if (do_wakeup) { 707 return 1;
697 smp_mb(); 708}
698 if (waitqueue_active(&pipe->wait)) 709EXPORT_SYMBOL(splice_from_pipe_next);
699 wake_up_interruptible(&pipe->wait);
700 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
701 }
702 710
703 return ret; 711/**
712 * splice_from_pipe_begin - start splicing from pipe
713 * @sd: information about the splice operation
714 *
715 * Description:
716 * This function should be called before a loop containing
717 * splice_from_pipe_next() and splice_from_pipe_feed() to
718 * initialize the necessary fields of @sd.
719 */
720void splice_from_pipe_begin(struct splice_desc *sd)
721{
722 sd->num_spliced = 0;
723 sd->need_wakeup = false;
724}
725EXPORT_SYMBOL(splice_from_pipe_begin);
726
727/**
728 * splice_from_pipe_end - finish splicing from pipe
729 * @pipe: pipe to splice from
730 * @sd: information about the splice operation
731 *
732 * Description:
733 * This function will wake up pipe writers if necessary. It should
734 * be called after a loop containing splice_from_pipe_next() and
735 * splice_from_pipe_feed().
736 */
737void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
738{
739 if (sd->need_wakeup)
740 wakeup_pipe_writers(pipe);
741}
742EXPORT_SYMBOL(splice_from_pipe_end);
743
744/**
745 * __splice_from_pipe - splice data from a pipe to given actor
746 * @pipe: pipe to splice from
747 * @sd: information to @actor
748 * @actor: handler that splices the data
749 *
750 * Description:
751 * This function does little more than loop over the pipe and call
752 * @actor to do the actual moving of a single struct pipe_buffer to
753 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
754 * pipe_to_user.
755 *
756 */
757ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
758 splice_actor *actor)
759{
760 int ret;
761
762 splice_from_pipe_begin(sd);
763 do {
764 ret = splice_from_pipe_next(pipe, sd);
765 if (ret > 0)
766 ret = splice_from_pipe_feed(pipe, sd, actor);
767 } while (ret > 0);
768 splice_from_pipe_end(pipe, sd);
769
770 return sd->num_spliced ? sd->num_spliced : ret;
704} 771}
705EXPORT_SYMBOL(__splice_from_pipe); 772EXPORT_SYMBOL(__splice_from_pipe);
706 773
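The quartet of exported helpers above exists so a filesystem can run the splice loop while taking and dropping its own locks around each batch of buffers; __splice_from_pipe is simply the no-extra-locking instance. A hedged sketch of a caller that wants i_mutex held only across the actual copy (kernel context of this patch assumed, not a stand-alone program; my_splice_write and my_actor are illustrative names -- compare generic_file_splice_write below):

/* Sketch only: uses the APIs introduced in this patch (2.6.30-era). */
static ssize_t my_splice_write(struct pipe_inode_info *pipe,
                               struct splice_desc *sd,
                               struct inode *inode, splice_actor *my_actor)
{
        ssize_t ret;

        pipe_lock(pipe);
        splice_from_pipe_begin(sd);
        do {
                /* sleeps for pipe data without holding i_mutex */
                ret = splice_from_pipe_next(pipe, sd);
                if (ret <= 0)
                        break;

                mutex_lock(&inode->i_mutex);
                ret = splice_from_pipe_feed(pipe, sd, my_actor);
                mutex_unlock(&inode->i_mutex);
        } while (ret > 0);
        splice_from_pipe_end(pipe, sd);   /* wakes writers if needed */
        pipe_unlock(pipe);

        return sd->num_spliced ? sd->num_spliced : ret;
}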
@@ -714,7 +781,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
714 * @actor: handler that splices the data 781 * @actor: handler that splices the data
715 * 782 *
716 * Description: 783 * Description:
717 * See __splice_from_pipe. This function locks the input and output inodes, 784 * See __splice_from_pipe. This function locks the pipe inode,
718 * otherwise it's identical to __splice_from_pipe(). 785 * otherwise it's identical to __splice_from_pipe().
719 * 786 *
720 */ 787 */
@@ -723,7 +790,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
723 splice_actor *actor) 790 splice_actor *actor)
724{ 791{
725 ssize_t ret; 792 ssize_t ret;
726 struct inode *inode = out->f_mapping->host;
727 struct splice_desc sd = { 793 struct splice_desc sd = {
728 .total_len = len, 794 .total_len = len,
729 .flags = flags, 795 .flags = flags,
@@ -731,21 +797,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
731 .u.file = out, 797 .u.file = out,
732 }; 798 };
733 799
734 /* 800 pipe_lock(pipe);
735 * The actor worker might be calling ->write_begin and
736 * ->write_end. Most of the time, these expect i_mutex to
737 * be held. Since this may result in an ABBA deadlock with
738 * pipe->inode, we have to order lock acquiry here.
739 */
740 inode_double_lock(inode, pipe->inode);
741 ret = __splice_from_pipe(pipe, &sd, actor); 801 ret = __splice_from_pipe(pipe, &sd, actor);
742 inode_double_unlock(inode, pipe->inode); 802 pipe_unlock(pipe);
743 803
744 return ret; 804 return ret;
745} 805}
746 806
747/** 807/**
748 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 808 * generic_file_splice_write - splice data from a pipe to a file
749 * @pipe: pipe info 809 * @pipe: pipe info
750 * @out: file to write to 810 * @out: file to write to
751 * @ppos: position in @out 811 * @ppos: position in @out
@@ -754,13 +814,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
754 * 814 *
755 * Description: 815 * Description:
756 * Will either move or copy pages (determined by @flags options) from 816 * Will either move or copy pages (determined by @flags options) from
757 * the given pipe inode to the given file. The caller is responsible 817 * the given pipe inode to the given file.
758 * for acquiring i_mutex on both inodes.
759 * 818 *
760 */ 819 */
761ssize_t 820ssize_t
762generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 821generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
763 loff_t *ppos, size_t len, unsigned int flags) 822 loff_t *ppos, size_t len, unsigned int flags)
764{ 823{
765 struct address_space *mapping = out->f_mapping; 824 struct address_space *mapping = out->f_mapping;
766 struct inode *inode = mapping->host; 825 struct inode *inode = mapping->host;
@@ -771,70 +830,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
771 .u.file = out, 830 .u.file = out,
772 }; 831 };
773 ssize_t ret; 832 ssize_t ret;
774 int err;
775
776 err = file_remove_suid(out);
777 if (unlikely(err))
778 return err;
779
780 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
781 if (ret > 0) {
782 unsigned long nr_pages;
783 833
784 *ppos += ret; 834 pipe_lock(pipe);
785 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
786 835
787 /* 836 splice_from_pipe_begin(&sd);
788 * If file or inode is SYNC and we actually wrote some data, 837 do {
789 * sync it. 838 ret = splice_from_pipe_next(pipe, &sd);
790 */ 839 if (ret <= 0)
791 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 840 break;
792 err = generic_osync_inode(inode, mapping,
793 OSYNC_METADATA|OSYNC_DATA);
794
795 if (err)
796 ret = err;
797 }
798 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
799 }
800 841
801 return ret; 842 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
802} 843 ret = file_remove_suid(out);
844 if (!ret)
845 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
846 mutex_unlock(&inode->i_mutex);
847 } while (ret > 0);
848 splice_from_pipe_end(pipe, &sd);
803 849
804EXPORT_SYMBOL(generic_file_splice_write_nolock); 850 pipe_unlock(pipe);
805 851
806/** 852 if (sd.num_spliced)
807 * generic_file_splice_write - splice data from a pipe to a file 853 ret = sd.num_spliced;
808 * @pipe: pipe info
809 * @out: file to write to
810 * @ppos: position in @out
811 * @len: number of bytes to splice
812 * @flags: splice modifier flags
813 *
814 * Description:
815 * Will either move or copy pages (determined by @flags options) from
816 * the given pipe inode to the given file.
817 *
818 */
819ssize_t
820generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
821 loff_t *ppos, size_t len, unsigned int flags)
822{
823 struct address_space *mapping = out->f_mapping;
824 struct inode *inode = mapping->host;
825 struct splice_desc sd = {
826 .total_len = len,
827 .flags = flags,
828 .pos = *ppos,
829 .u.file = out,
830 };
831 ssize_t ret;
832 854
833 inode_double_lock(inode, pipe->inode);
834 ret = file_remove_suid(out);
835 if (likely(!ret))
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 inode_double_unlock(inode, pipe->inode);
838 if (ret > 0) { 855 if (ret > 0) {
839 unsigned long nr_pages; 856 unsigned long nr_pages;
840 857
@@ -1323,8 +1340,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1323 if (!pipe) 1340 if (!pipe)
1324 return -EBADF; 1341 return -EBADF;
1325 1342
1326 if (pipe->inode) 1343 pipe_lock(pipe);
1327 mutex_lock(&pipe->inode->i_mutex);
1328 1344
1329 error = ret = 0; 1345 error = ret = 0;
1330 while (nr_segs) { 1346 while (nr_segs) {
@@ -1379,8 +1395,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1379 iov++; 1395 iov++;
1380 } 1396 }
1381 1397
1382 if (pipe->inode) 1398 pipe_unlock(pipe);
1383 mutex_unlock(&pipe->inode->i_mutex);
1384 1399
1385 if (!ret) 1400 if (!ret)
1386 ret = error; 1401 ret = error;
@@ -1508,7 +1523,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1508 return 0; 1523 return 0;
1509 1524
1510 ret = 0; 1525 ret = 0;
1511 mutex_lock(&pipe->inode->i_mutex); 1526 pipe_lock(pipe);
1512 1527
1513 while (!pipe->nrbufs) { 1528 while (!pipe->nrbufs) {
1514 if (signal_pending(current)) { 1529 if (signal_pending(current)) {
@@ -1526,7 +1541,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1526 pipe_wait(pipe); 1541 pipe_wait(pipe);
1527 } 1542 }
1528 1543
1529 mutex_unlock(&pipe->inode->i_mutex); 1544 pipe_unlock(pipe);
1530 return ret; 1545 return ret;
1531} 1546}
1532 1547
@@ -1546,7 +1561,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1546 return 0; 1561 return 0;
1547 1562
1548 ret = 0; 1563 ret = 0;
1549 mutex_lock(&pipe->inode->i_mutex); 1564 pipe_lock(pipe);
1550 1565
1551 while (pipe->nrbufs >= PIPE_BUFFERS) { 1566 while (pipe->nrbufs >= PIPE_BUFFERS) {
1552 if (!pipe->readers) { 1567 if (!pipe->readers) {
@@ -1567,7 +1582,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1567 pipe->waiting_writers--; 1582 pipe->waiting_writers--;
1568 } 1583 }
1569 1584
1570 mutex_unlock(&pipe->inode->i_mutex); 1585 pipe_unlock(pipe);
1571 return ret; 1586 return ret;
1572} 1587}
1573 1588
@@ -1583,10 +1598,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1583 1598
1584 /* 1599 /*
1585 * Potential ABBA deadlock, work around it by ordering lock 1600 * Potential ABBA deadlock, work around it by ordering lock
1586 * grabbing by inode address. Otherwise two different processes 1601 * grabbing by pipe info address. Otherwise two different processes
1587 * could deadlock (one doing tee from A -> B, the other from B -> A). 1602 * could deadlock (one doing tee from A -> B, the other from B -> A).
1588 */ 1603 */
1589 inode_double_lock(ipipe->inode, opipe->inode); 1604 pipe_double_lock(ipipe, opipe);
1590 1605
1591 do { 1606 do {
1592 if (!opipe->readers) { 1607 if (!opipe->readers) {
@@ -1637,7 +1652,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1637 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1652 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1638 ret = -EAGAIN; 1653 ret = -EAGAIN;
1639 1654
1640 inode_double_unlock(ipipe->inode, opipe->inode); 1655 pipe_unlock(ipipe);
1656 pipe_unlock(opipe);
1641 1657
1642 /* 1658 /*
1643 * If we put data in the output pipe, wakeup any potential readers. 1659 * If we put data in the output pipe, wakeup any potential readers.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 8258cf9a0317..70e3244fa30f 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,4 +5,3 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1c4739e33af6..40c98fa6b5d6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -252,6 +252,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
252 cache->entries = entries; 252 cache->entries = entries;
253 cache->block_size = block_size; 253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT; 254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->pages = cache->pages ? cache->pages : 1;
255 cache->name = name; 256 cache->name = name;
256 cache->num_waiters = 0; 257 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock); 258 spin_lock_init(&cache->lock);
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d83799..0adc624c956f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -157,6 +157,16 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
157 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) 157 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
158 goto failed_mount; 158 goto failed_mount;
159 159
160 /*
161 * Check the system page size is not larger than the filesystem
162 * block size (by default 128K). This is currently not supported.
163 */
164 if (PAGE_CACHE_SIZE > msblk->block_size) {
165 ERROR("Page size > filesystem block size (%d). This is "
166 "currently not supported!\n", msblk->block_size);
167 goto failed_mount;
168 }
169
160 msblk->block_log = le16_to_cpu(sblk->block_log); 170 msblk->block_log = le16_to_cpu(sblk->block_log);
161 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) 171 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
162 goto failed_mount; 172 goto failed_mount;
@@ -301,6 +311,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 311static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 312{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 313 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
314 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 315
305 TRACE("Entered squashfs_statfs\n"); 316 TRACE("Entered squashfs_statfs\n");
306 317
@@ -311,6 +322,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 322 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 323 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 324 buf->f_namelen = SQUASHFS_NAME_LEN;
325 buf->f_fsid.val[0] = (u32)id;
326 buf->f_fsid.val[1] = (u32)(id >> 32);
314 327
315 return 0; 328 return 0;
316} 329}
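Several statfs implementations in this merge gain the same two lines: a 64-bit encoded device number split across the two 32-bit words of f_fsid. The packing and its inverse, in isolation (plain C outside the kernel; the constant stands in for huge_encode_dev()):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t id = 0x1122334455667788ULL; /* stand-in for huge_encode_dev() */
        uint32_t val[2];

        val[0] = (uint32_t)id;               /* low 32 bits  */
        val[1] = (uint32_t)(id >> 32);       /* high 32 bits */

        /* recombining the two words must give back the original id */
        assert((((uint64_t)val[1] << 32) | val[0]) == id);
        return 0;
}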
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..075694e31d8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
55 55
56EXPORT_SYMBOL(vfs_getattr); 56EXPORT_SYMBOL(vfs_getattr);
57 57
58int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) 58int vfs_fstat(unsigned int fd, struct kstat *stat)
59{ 59{
60 struct path path; 60 struct file *f = fget(fd);
61 int error; 61 int error = -EBADF;
62 62
63 error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); 63 if (f) {
64 if (!error) { 64 error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
65 error = vfs_getattr(path.mnt, path.dentry, stat); 65 fput(f);
66 path_put(&path);
67 } 66 }
68 return error; 67 return error;
69} 68}
69EXPORT_SYMBOL(vfs_fstat);
70 70
71int vfs_stat(char __user *name, struct kstat *stat) 71int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
72{ 72{
73 return vfs_stat_fd(AT_FDCWD, name, stat); 73 struct path path;
74} 74 int error = -EINVAL;
75 int lookup_flags = 0;
75 76
76EXPORT_SYMBOL(vfs_stat); 77 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
78 goto out;
77 79
78int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) 80 if (!(flag & AT_SYMLINK_NOFOLLOW))
79{ 81 lookup_flags |= LOOKUP_FOLLOW;
80 struct path path;
81 int error;
82 82
83 error = user_path_at(dfd, name, 0, &path); 83 error = user_path_at(dfd, filename, lookup_flags, &path);
84 if (!error) { 84 if (error)
85 error = vfs_getattr(path.mnt, path.dentry, stat); 85 goto out;
86 path_put(&path); 86
87 } 87 error = vfs_getattr(path.mnt, path.dentry, stat);
88 path_put(&path);
89out:
88 return error; 90 return error;
89} 91}
92EXPORT_SYMBOL(vfs_fstatat);
90 93
91int vfs_lstat(char __user *name, struct kstat *stat) 94int vfs_stat(char __user *name, struct kstat *stat)
92{ 95{
93 return vfs_lstat_fd(AT_FDCWD, name, stat); 96 return vfs_fstatat(AT_FDCWD, name, stat, 0);
94} 97}
98EXPORT_SYMBOL(vfs_stat);
95 99
96EXPORT_SYMBOL(vfs_lstat); 100int vfs_lstat(char __user *name, struct kstat *stat)
97
98int vfs_fstat(unsigned int fd, struct kstat *stat)
99{ 101{
100 struct file *f = fget(fd); 102 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
101 int error = -EBADF;
102
103 if (f) {
104 error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
105 fput(f);
106 }
107 return error;
108} 103}
104EXPORT_SYMBOL(vfs_lstat);
109 105
110EXPORT_SYMBOL(vfs_fstat);
111 106
112#ifdef __ARCH_WANT_OLD_STAT 107#ifdef __ARCH_WANT_OLD_STAT
113 108
@@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
155SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 150SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
156{ 151{
157 struct kstat stat; 152 struct kstat stat;
158 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 153 int error;
159 154
160 if (!error) 155 error = vfs_stat(filename, &stat);
161 error = cp_old_stat(&stat, statbuf); 156 if (error)
157 return error;
162 158
163 return error; 159 return cp_old_stat(&stat, statbuf);
164} 160}
165 161
166SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 162SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
167{ 163{
168 struct kstat stat; 164 struct kstat stat;
169 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 165 int error;
170 166
171 if (!error) 167 error = vfs_lstat(filename, &stat);
172 error = cp_old_stat(&stat, statbuf); 168 if (error)
169 return error;
173 170
174 return error; 171 return cp_old_stat(&stat, statbuf);
175} 172}
176 173
177SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) 174SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
240SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) 237SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
241{ 238{
242 struct kstat stat; 239 struct kstat stat;
243 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 240 int error = vfs_stat(filename, &stat);
244
245 if (!error)
246 error = cp_new_stat(&stat, statbuf);
247 241
248 return error; 242 if (error)
243 return error;
244 return cp_new_stat(&stat, statbuf);
249} 245}
250 246
251SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) 247SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
252{ 248{
253 struct kstat stat; 249 struct kstat stat;
254 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 250 int error;
255 251
256 if (!error) 252 error = vfs_lstat(filename, &stat);
257 error = cp_new_stat(&stat, statbuf); 253 if (error)
254 return error;
258 255
259 return error; 256 return cp_new_stat(&stat, statbuf);
260} 257}
261 258
262#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) 259#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
@@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
264 struct stat __user *, statbuf, int, flag) 261 struct stat __user *, statbuf, int, flag)
265{ 262{
266 struct kstat stat; 263 struct kstat stat;
267 int error = -EINVAL; 264 int error;
268
269 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
270 goto out;
271
272 if (flag & AT_SYMLINK_NOFOLLOW)
273 error = vfs_lstat_fd(dfd, filename, &stat);
274 else
275 error = vfs_stat_fd(dfd, filename, &stat);
276
277 if (!error)
278 error = cp_new_stat(&stat, statbuf);
279 265
280out: 266 error = vfs_fstatat(dfd, filename, &stat, flag);
281 return error; 267 if (error)
268 return error;
269 return cp_new_stat(&stat, statbuf);
282} 270}
283#endif 271#endif
284 272
@@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
404 struct stat64 __user *, statbuf, int, flag) 392 struct stat64 __user *, statbuf, int, flag)
405{ 393{
406 struct kstat stat; 394 struct kstat stat;
407 int error = -EINVAL; 395 int error;
408
409 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
410 goto out;
411
412 if (flag & AT_SYMLINK_NOFOLLOW)
413 error = vfs_lstat_fd(dfd, filename, &stat);
414 else
415 error = vfs_stat_fd(dfd, filename, &stat);
416
417 if (!error)
418 error = cp_new_stat64(&stat, statbuf);
419 396
420out: 397 error = vfs_fstatat(dfd, filename, &stat, flag);
421 return error; 398 if (error)
399 return error;
400 return cp_new_stat64(&stat, statbuf);
422} 401}
423#endif /* __ARCH_WANT_STAT64 */ 402#endif /* __ARCH_WANT_STAT64 */
424 403
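After this rework every path-based stat variant is a thin wrapper around vfs_fstatat(), with AT_SYMLINK_NOFOLLOW the only switch between stat and lstat semantics. The same contract is what fstatat(2) exposes to user space, so the wrapper relationship can be demonstrated outside the kernel:

#define _XOPEN_SOURCE 700
#include <fcntl.h>      /* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <stdio.h>
#include <sys/stat.h>

/* user-space mirrors of the new vfs_stat()/vfs_lstat() wrappers */
static int my_stat(const char *name, struct stat *st)
{
        return fstatat(AT_FDCWD, name, st, 0);
}

static int my_lstat(const char *name, struct stat *st)
{
        return fstatat(AT_FDCWD, name, st, AT_SYMLINK_NOFOLLOW);
}

int main(void)
{
        struct stat st;

        if (my_lstat("/tmp", &st) == 0)
                printf("mode %o\n", (unsigned)st.st_mode);
        return my_stat("/tmp", &st) ? 1 : 0;
}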
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba7..1943fdf655fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -208,6 +208,34 @@ void deactivate_super(struct super_block *s)
208EXPORT_SYMBOL(deactivate_super); 208EXPORT_SYMBOL(deactivate_super);
209 209
210/** 210/**
211 * deactivate_locked_super - drop an active reference to superblock
212 * @s: superblock to deactivate
213 *
214 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
215 * it does not unlock it until it's all over. As the result, it's safe to
216 * use to dispose of new superblock on ->get_sb() failure exits - nobody
217 * will see the sucker until it's all over. Equivalent using up_write +
218 * deactivate_super is safe for that purpose only if superblock is either
219 * safe to use or has NULL ->s_root when we unlock.
220 */
221void deactivate_locked_super(struct super_block *s)
222{
223 struct file_system_type *fs = s->s_type;
224 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
225 s->s_count -= S_BIAS-1;
226 spin_unlock(&sb_lock);
227 vfs_dq_off(s, 0);
228 fs->kill_sb(s);
229 put_filesystem(fs);
230 put_super(s);
231 } else {
232 up_write(&s->s_umount);
233 }
234}
235
236EXPORT_SYMBOL(deactivate_locked_super);
237
238/**
211 * grab_super - acquire an active reference 239 * grab_super - acquire an active reference
212 * @s: reference we are trying to make active 240 * @s: reference we are trying to make active
213 * 241 *
@@ -287,6 +315,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 315 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 316 return sync_blockdev(sb->s_bdev);
289} 317}
318EXPORT_SYMBOL_GPL(fsync_super);
290 319
291/** 320/**
292 * generic_shutdown_super - common helper for ->kill_sb() 321 * generic_shutdown_super - common helper for ->kill_sb()
@@ -770,6 +799,45 @@ void kill_litter_super(struct super_block *sb)
770 799
771EXPORT_SYMBOL(kill_litter_super); 800EXPORT_SYMBOL(kill_litter_super);
772 801
802static int ns_test_super(struct super_block *sb, void *data)
803{
804 return sb->s_fs_info == data;
805}
806
807static int ns_set_super(struct super_block *sb, void *data)
808{
809 sb->s_fs_info = data;
810 return set_anon_super(sb, NULL);
811}
812
813int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
814 int (*fill_super)(struct super_block *, void *, int),
815 struct vfsmount *mnt)
816{
817 struct super_block *sb;
818
819 sb = sget(fs_type, ns_test_super, ns_set_super, data);
820 if (IS_ERR(sb))
821 return PTR_ERR(sb);
822
823 if (!sb->s_root) {
824 int err;
825 sb->s_flags = flags;
826 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
827 if (err) {
828 deactivate_locked_super(sb);
829 return err;
830 }
831
832 sb->s_flags |= MS_ACTIVE;
833 }
834
835 simple_set_mnt(mnt, sb);
836 return 0;
837}
838
839EXPORT_SYMBOL(get_sb_ns);
840
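get_sb_ns() keys superblocks on an opaque namespace pointer stored in s_fs_info, so repeated mounts from the same namespace share one superblock. A hedged sketch of a pseudo-filesystem built on it (kernel context assumed; myfs_fill_super and get_my_ns are hypothetical placeholders, only get_sb_ns() is real here):

static int myfs_fill_super(struct super_block *sb, void *data, int silent);
static void *get_my_ns(void);   /* placeholder namespace lookup */

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data, struct vfsmount *mnt)
{
        /* an existing sb is reused when its s_fs_info matches the cookie */
        return get_sb_ns(fs_type, flags, get_my_ns(), myfs_fill_super, mnt);
}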
773#ifdef CONFIG_BLOCK 841#ifdef CONFIG_BLOCK
774static int set_bdev_super(struct super_block *s, void *data) 842static int set_bdev_super(struct super_block *s, void *data)
775{ 843{
@@ -813,8 +881,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
813 881
814 if (s->s_root) { 882 if (s->s_root) {
815 if ((flags ^ s->s_flags) & MS_RDONLY) { 883 if ((flags ^ s->s_flags) & MS_RDONLY) {
816 up_write(&s->s_umount); 884 deactivate_locked_super(s);
817 deactivate_super(s);
818 error = -EBUSY; 885 error = -EBUSY;
819 goto error_bdev; 886 goto error_bdev;
820 } 887 }
@@ -829,8 +896,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
829 sb_set_blocksize(s, block_size(bdev)); 896 sb_set_blocksize(s, block_size(bdev));
830 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 897 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
831 if (error) { 898 if (error) {
832 up_write(&s->s_umount); 899 deactivate_locked_super(s);
833 deactivate_super(s);
834 goto error; 900 goto error;
835 } 901 }
836 902
@@ -856,7 +922,7 @@ void kill_block_super(struct super_block *sb)
856 struct block_device *bdev = sb->s_bdev; 922 struct block_device *bdev = sb->s_bdev;
857 fmode_t mode = sb->s_mode; 923 fmode_t mode = sb->s_mode;
858 924
859 bdev->bd_super = 0; 925 bdev->bd_super = NULL;
860 generic_shutdown_super(sb); 926 generic_shutdown_super(sb);
861 sync_blockdev(bdev); 927 sync_blockdev(bdev);
862 close_bdev_exclusive(bdev, mode); 928 close_bdev_exclusive(bdev, mode);
@@ -880,8 +946,7 @@ int get_sb_nodev(struct file_system_type *fs_type,
880 946
881 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 947 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
882 if (error) { 948 if (error) {
883 up_write(&s->s_umount); 949 deactivate_locked_super(s);
884 deactivate_super(s);
885 return error; 950 return error;
886 } 951 }
887 s->s_flags |= MS_ACTIVE; 952 s->s_flags |= MS_ACTIVE;
@@ -911,8 +976,7 @@ int get_sb_single(struct file_system_type *fs_type,
911 s->s_flags = flags; 976 s->s_flags = flags;
912 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); 977 error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
913 if (error) { 978 if (error) {
914 up_write(&s->s_umount); 979 deactivate_locked_super(s);
915 deactivate_super(s);
916 return error; 980 return error;
917 } 981 }
918 s->s_flags |= MS_ACTIVE; 982 s->s_flags |= MS_ACTIVE;
@@ -965,8 +1029,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
965 return mnt; 1029 return mnt;
966out_sb: 1030out_sb:
967 dput(mnt->mnt_root); 1031 dput(mnt->mnt_root);
968 up_write(&mnt->mnt_sb->s_umount); 1032 deactivate_locked_super(mnt->mnt_sb);
969 deactivate_super(mnt->mnt_sb);
970out_free_secdata: 1033out_free_secdata:
971 free_secdata(secdata); 1034 free_secdata(secdata);
972out_mnt: 1035out_mnt:
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a1..9345806c8853 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
157 count = size - offs; 157 count = size - offs;
158 } 158 }
159 159
160 temp = kmalloc(count, GFP_KERNEL); 160 temp = memdup_user(userbuf, count);
161 if (!temp) 161 if (IS_ERR(temp))
162 return -ENOMEM; 162 return PTR_ERR(temp);
163
164 if (copy_from_user(temp, userbuf, count)) {
165 count = -EFAULT;
166 goto out_free;
167 }
168 163
169 mutex_lock(&bb->mutex); 164 mutex_lock(&bb->mutex);
170 165
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
176 if (count > 0) 171 if (count > 0)
177 *off = offs + count; 172 *off = offs + count;
178 173
179out_free:
180 kfree(temp);
181 return count; 174 return count;
182} 175}
183 176
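The write path above is one instance of a tree-wide cleanup: an open-coded kmalloc + copy_from_user + error-unwind sequence becomes a single memdup_user() call that returns either the filled buffer or an ERR_PTR. The resulting calling convention, in isolation (kernel context assumed; copy_in is an illustrative name):

/* memdup_user() folds allocation, copy and error unwinding into one call. */
static ssize_t copy_in(const char __user *userbuf, size_t count)
{
        char *temp = memdup_user(userbuf, count);

        if (IS_ERR(temp))
                return PTR_ERR(temp);   /* -ENOMEM or -EFAULT */

        /* ... consume temp ... */
        kfree(temp);
        return count;
}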
@@ -234,7 +227,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
234 return ret; 227 return ret;
235} 228}
236 229
237static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page) 230static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
238{ 231{
239 struct file *file = vma->vm_file; 232 struct file *file = vma->vm_file;
240 struct bin_buffer *bb = file->private_data; 233 struct bin_buffer *bb = file->private_data;
@@ -242,15 +235,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
242 int ret; 235 int ret;
243 236
244 if (!bb->vm_ops) 237 if (!bb->vm_ops)
245 return -EINVAL; 238 return VM_FAULT_SIGBUS;
246 239
247 if (!bb->vm_ops->page_mkwrite) 240 if (!bb->vm_ops->page_mkwrite)
248 return 0; 241 return 0;
249 242
250 if (!sysfs_get_active_two(attr_sd)) 243 if (!sysfs_get_active_two(attr_sd))
251 return -EINVAL; 244 return VM_FAULT_SIGBUS;
252 245
253 ret = bb->vm_ops->page_mkwrite(vma, page); 246 ret = bb->vm_ops->page_mkwrite(vma, vmf);
254 247
255 sysfs_put_active_two(attr_sd); 248 sysfs_put_active_two(attr_sd);
256 return ret; 249 return ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 289c43a47263..561a9c050cef 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
446 if (buffer->event != atomic_read(&od->event)) 446 if (buffer->event != atomic_read(&od->event))
447 goto trigger; 447 goto trigger;
448 448
449 return 0; 449 return DEFAULT_POLLMASK;
450 450
451 trigger: 451 trigger:
452 buffer->needs_read_fill = 1; 452 buffer->needs_read_fill = 1;
453 return POLLERR|POLLPRI; 453 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
454} 454}
455 455
456void sysfs_notify_dirent(struct sysfs_dirent *sd) 456void sysfs_notify_dirent(struct sysfs_dirent *sd)
@@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct {
667 struct work_struct work; 667 struct work_struct work;
668}; 668};
669 669
670static struct workqueue_struct *sysfs_workqueue;
670static DEFINE_MUTEX(sysfs_workq_mutex); 671static DEFINE_MUTEX(sysfs_workq_mutex);
671static LIST_HEAD(sysfs_workq); 672static LIST_HEAD(sysfs_workq);
672static void sysfs_schedule_callback_work(struct work_struct *work) 673static void sysfs_schedule_callback_work(struct work_struct *work)
@@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
715 mutex_lock(&sysfs_workq_mutex); 716 mutex_lock(&sysfs_workq_mutex);
716 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) 717 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
717 if (ss->kobj == kobj) { 718 if (ss->kobj == kobj) {
719 module_put(owner);
718 mutex_unlock(&sysfs_workq_mutex); 720 mutex_unlock(&sysfs_workq_mutex);
719 return -EAGAIN; 721 return -EAGAIN;
720 } 722 }
721 mutex_unlock(&sysfs_workq_mutex); 723 mutex_unlock(&sysfs_workq_mutex);
722 724
725 if (sysfs_workqueue == NULL) {
726 sysfs_workqueue = create_singlethread_workqueue("sysfsd");
727 if (sysfs_workqueue == NULL) {
728 module_put(owner);
729 return -ENOMEM;
730 }
731 }
732
723 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 733 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
724 if (!ss) { 734 if (!ss) {
725 module_put(owner); 735 module_put(owner);
@@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
735 mutex_lock(&sysfs_workq_mutex); 745 mutex_lock(&sysfs_workq_mutex);
736 list_add_tail(&ss->workq_list, &sysfs_workq); 746 list_add_tail(&ss->workq_list, &sysfs_workq);
737 mutex_unlock(&sysfs_workq_mutex); 747 mutex_unlock(&sysfs_workq_mutex);
738 schedule_work(&ss->work); 748 queue_work(sysfs_workqueue, &ss->work);
739 return 0; 749 return 0;
740} 750}
741EXPORT_SYMBOL_GPL(sysfs_schedule_callback); 751EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
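The sysfs fix above has two parts: the module reference is now dropped on every early return (the duplicate-kobject path leaked it before), and callbacks run on a dedicated single-threaded workqueue created on first use rather than on the shared keventd queue. The lazy-creation shape in isolation (kernel context assumed; caller-side serialization is presumed, as in the patch):

/* Lazily created single-threaded queue, as in sysfs_schedule_callback(). */
static struct workqueue_struct *my_wq;

static int my_queue_work(struct work_struct *work)
{
        if (!my_wq) {
                my_wq = create_singlethread_workqueue("my_wq");
                if (!my_wq)
                        return -ENOMEM;
        }
        queue_work(my_wq, work);
        return 0;
}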
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
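The budgeting change folds the old add-then-divide pair into one div_u64() call; both forms compute the ceiling of idx_size / idx_leb_size. The rounded-up division in isolation (plain C; div_u64 itself is just 64-bit division with a 32-bit divisor):

#include <stdint.h>
#include <stdio.h>

/* ceiling division, the div_u64(n + d - 1, d) pattern from the patch */
static uint64_t ceil_div(uint64_t n, uint32_t d)
{
        return (n + d - 1) / d;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)ceil_div(10, 4));  /* 3 */
        printf("%llu\n", (unsigned long long)ceil_div(8, 4));   /* 2 */
        return 0;
}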
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
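As a rough illustration of the check described above, the standalone model below captures only the core condition; the real do_budget_space() additionally accounts for the reserved pool, dirty space and per-LEB overheads.

#include <stdio.h>

/* Simplified model: budgeting fails once the index reservation plus the
 * requested data space no longer fit into the free LEBs. */
static int budget_ok(long long min_idx_lebs, long long data_lebs,
		     long long free_lebs)
{
	return min_idx_lebs + data_lebs <= free_lebs;	/* else -ENOSPC */
}

int main(void)
{
	printf("%d\n", budget_ok(74, 10, 100));	/* 1: fits */
	printf("%d\n", budget_ok(74, 40, 100));	/* 0: would be -ENOSPC */
	return 0;
}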
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would break user expectations about what free space is. Users seem to 698 * break user expectations about what free space is. Users seem to be
702 * be accustomed to assume that if the file-system reports N bytes of free space, 699 * accustomed to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
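For context on the KERN_CONT fixes in this hunk: a printk() without an explicit level may be treated as the start of a new message with a default log level, so output assembled piecewise must mark its continuation lines. A minimal kernel-side fragment (not a standalone program; i, nlen and name stand in for the dump loop's variables):

	printk(KERN_DEBUG "name: ");		/* starts the message */
	for (i = 0; i < nlen; i++)
		printk(KERN_CONT "%c", name[i]);	/* continues the same line */
	printk(KERN_CONT "\n");			/* terminates it */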
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less than or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent to the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f261..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long and unacceptable. 959 * whole index and correct all inode sizes, which is long and unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
@@ -1434,8 +1444,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1434 * mmap()d file has taken write protection fault and is being made 1444 * mmap()d file has taken write protection fault and is being made
1435 * writable. UBIFS must ensure page is budgeted for. 1445 * writable. UBIFS must ensure page is budgeted for.
1436 */ 1446 */
1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1447static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1438{ 1448{
1449 struct page *page = vmf->page;
1439 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1450 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1440 struct ubifs_info *c = inode->i_sb->s_fs_info; 1451 struct ubifs_info *c = inode->i_sb->s_fs_info;
1441 struct timespec now = ubifs_current_time(inode); 1452 struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1458,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1447 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1458 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1448 1459
1449 if (unlikely(c->ro_media)) 1460 if (unlikely(c->ro_media))
1450 return -EROFS; 1461 return VM_FAULT_SIGBUS; /* -EROFS */
1451 1462
1452 /* 1463 /*
1453 * We have not locked @page so far so we may budget for changing the 1464 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1491,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1480 if (err == -ENOSPC) 1491 if (err == -ENOSPC)
1481 ubifs_warn("out of space for mmapped file " 1492 ubifs_warn("out of space for mmapped file "
1482 "(inode number %lu)", inode->i_ino); 1493 "(inode number %lu)", inode->i_ino);
1483 return err; 1494 return VM_FAULT_SIGBUS;
1484 } 1495 }
1485 1496
1486 lock_page(page); 1497 lock_page(page);
@@ -1520,6 +1531,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1520out_unlock: 1531out_unlock:
1521 unlock_page(page); 1532 unlock_page(page);
1522 ubifs_release_budget(c, &req); 1533 ubifs_release_budget(c, &req);
1534 if (err)
1535 err = VM_FAULT_SIGBUS;
1523 return err; 1536 return err;
1524} 1537}
1525 1538
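The file.c changes above track a VFS interface change: ->page_mkwrite() now receives a struct vm_fault instead of the page, and must return VM_FAULT_* codes rather than -errno values. A hedged sketch of the new shape; do_budget() is a made-up placeholder for the filesystem's own work:

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;	/* formerly a direct argument */

	if (do_budget(page))		/* hypothetical helper */
		return VM_FAULT_SIGBUS;	/* e.g. instead of -EROFS/-ENOSPC */
	return 0;
}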
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other a negative 490 * failed to find a LEB with @min_space bytes of free space and other a negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
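The interface change above replaces the returned free-byte count with the offset where free space starts, so callers no longer compute 'leb_size - free' themselves. A small standalone check of the equivalence, with an arbitrary example size:

#include <stdio.h>
#include <assert.h>

int main(void)
{
	int leb_size = 128 * 1024;	/* hypothetical LEB size */
	int free = leb_size;		/* a completely empty LEB */
	int offs = leb_size - free;	/* what the function now returns */

	/* the old and new emptiness tests agree */
	assert((free == leb_size) == (offs == 0));
	printf("free=%d offs=%d\n", free, offs);
	return 0;
}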
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
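A hedged usage sketch for the list_sort() helper added above, assuming <linux/list.h> and a made-up element type. The callback only needs to be consistent; equal elements may be reported either way:

struct item {
	struct list_head list;
	int key;
};

static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct item *ia = list_entry(a, struct item, list);
	struct item *ib = list_entry(b, struct item, list);

	return ia->key <= ib->key ? -1 : 1;
}

/* list_sort(NULL, &items, item_cmp) then leaves 'items' in ascending
 * 'key' order. */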
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/**
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and sorted by length in descending order. Directory entry nodes go
249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
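The two comparison rules can be modeled in user space; the standalone sketch below mirrors only the data-node rule (by inode, then block), with invented numbers:

#include <stdio.h>

static int data_cmp(unsigned inuma, unsigned blka,
		    unsigned inumb, unsigned blkb)
{
	if (inuma == inumb)
		return blka <= blkb ? -1 : 1;
	return inuma <= inumb ? -1 : 1;
}

int main(void)
{
	printf("%d\n", data_cmp(7, 0, 7, 1));	/* -1: same inode, lower block */
	printf("%d\n", data_cmp(8, 0, 7, 9));	/*  1: higher inode goes later */
	return 0;
}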
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of nodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number.
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139 * Write buffer wasn't seek'ed or there is not enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
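The log.c hunk above (and the shrinker.c hunk later in this series) switches to the list_move()/list_move_tail() helpers, which fold the common delete-and-reinsert pattern into one call. A kernel-side fragment, reusing the names from the hunk:

	/* these two calls... */
	list_del(&bud->list);
	list_add(&bud->list, &c->old_buds);
	/* ...are equivalent to: */
	list_move(&bud->list, &c->old_buds);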
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes;
1766 * o %4 - wasted @len bytes.
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if no more data is found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
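The rewritten check relies on erased flash reading back as 0xFF. The standalone model below uses that assumption plus a 24-byte stand-in for UBIFS_CH_SZ and arbitrary sizes; it reports whether anything non-empty follows the corrupt node's common header within the min_io_size-aligned window:

#include <stdio.h>
#include <string.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static int is_empty(const unsigned char *buf, int len)
{
	for (int i = 0; i < len; i++)
		if (buf[i] != 0xff)
			return 0;
	return 1;
}

int main(void)
{
	unsigned char leb[4096];
	int min_io_size = 512, offs = 1000, len = (int)sizeof(leb) - offs;
	int skip;

	memset(leb, 0xff, sizeof(leb));		/* erased flash */
	memset(leb + offs, 0xab, 40);		/* a 40-byte "corrupt node" */

	/* check for empty space after the node's common header */
	skip = ALIGN_UP(offs + 24, min_io_size) - offs;	/* 24 ~ UBIFS_CH_SZ */
	/* prints 0 here: the node body extends past the header window,
	 * i.e. "more data is found" and the header must be validated */
	printf("empty after header: %d\n",
	       is_empty(leb + offs + skip, len - skip));
	return 0;
}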
483/** 459/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
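The new mount-time decision can be summarized in a small standalone model; the constants are copied from the patch, and the return values stand in for success, the R/O-compatible case, -EROFS and -EINVAL:

#include <stdio.h>

#define UBIFS_FORMAT_VERSION	4
#define UBIFS_RO_COMPAT_VERSION	0

/* 0: fully supported; 1: R/O mount only (rw_incompat in the patch);
 * -30/-22 stand in for -EROFS/-EINVAL. */
static int check_format(int fmt, int ro_compat, int ro_mount)
{
	if (fmt <= UBIFS_FORMAT_VERSION)
		return 0;
	if (ro_mount && ro_compat <= UBIFS_RO_COMPAT_VERSION)
		return 1;
	return ro_compat <= UBIFS_RO_COMPAT_VERSION ? -30 : -22;
}

int main(void)
{
	printf("%d\n", check_format(5, 0, 1));	/* 1: newer format, R/O mount */
	printf("%d\n", check_format(5, 0, 0));	/* -30: R/W mount refused */
	printf("%d\n", check_format(5, 1, 1));	/* -22: even R/O is too new */
	return 0;
}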
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..e9f7a754c4f7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
@@ -2038,8 +2055,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2038 return 0; 2055 return 0;
2039 2056
2040out_deact: 2057out_deact:
2041 up_write(&sb->s_umount); 2058 deactivate_locked_super(sb);
2042 deactivate_super(sb);
2043out_close: 2059out_close:
2044 ubi_close_volume(ubi); 2060 ubi_close_volume(ubi);
2045 return err; 2061 return err;
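Two of the super.c hunks above are format-string hardening: a variable string must never be passed where a printf-style format is expected, both for seq_printf() and for kthread_create()'s thread name. The fixed pattern, as kernel-side fragments:

	seq_printf(s, ",compr=%s", name);		/* not: seq_printf(s, name) */
	kthread_create(fn, data, "%s", c->bgt_name);	/* not: ..., c->bgt_name) */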
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and happen only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into the mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders, which only need R/O mounting. With
59 * this version field it is possible to change the UBIFS format without
60 * needing to update boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
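A sketch of the mount-time policy the two version constants encode
(field names follow the ubifs_info additions later in this diff; the
real error handling in fs/ubifs/sb.c may differ in detail):

	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
		/* Newer image: older code must never write it... */
		if (c->ro_compat_version > UBIFS_RO_COMPAT_VERSION)
			return -EINVAL;	/* not even R/O-mountable */
		if (!(c->vfs_sb->s_flags & MS_RDONLY))
			return -EROFS;	/* must be mounted read-only */
		/* ...but this R/O mount is fine; block later R/W remounts. */
		c->rw_incompat = 1;
	}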
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
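A sketch of how this threshold is typically applied when writing a data
node (compare ubifs_compress(); variable names here are illustrative):

	/* Compression saved fewer than UBIFS_MIN_COMPRESS_DIFF bytes, so
	 * store the node uncompressed: the faster read outweighs the
	 * negligible space saving. */
	if (in_len - out_len < UBIFS_MIN_COMPRESS_DIFF)
		compr_type = UBIFS_COMPR_NONE;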
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
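The new __le32 is carved out of the trailing padding (3972 = 3968 + 4),
so sizeof(struct ubifs_sb_node) stays the same and existing superblocks
remain parseable. A hypothetical compile-time guard for that invariant
(4096 being the node's expected total size, common header included):

	BUILD_BUG_ON(sizeof(struct ubifs_sb_node) != 4096);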
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
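The parenthesized formula in the @idx_leb_size comment is presumably
computed once at mount time and cached in the new field, along the
lines of:

	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;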
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb1..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
87{ 87{
88 struct buffer_head *bh = NULL; 88 struct buffer_head *bh = NULL;
89 int retval = 0; 89 int retval = 0;
90 kernel_lb_addr loc; 90 struct kernel_lb_addr loc;
91 91
92 loc.logicalBlockNum = bitmap->s_extPosition; 92 loc.logicalBlockNum = bitmap->s_extPosition;
93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 93 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
94 94
95 bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block)); 95 bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
96 if (!bh) 96 if (!bh)
97 retval = -EIO; 97 retval = -EIO;
98 98
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
140 return slot; 140 return slot;
141} 141}
142 142
143static bool udf_add_free_space(struct udf_sb_info *sbi, 143static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
144 u16 partition, u32 cnt)
145{ 144{
145 struct udf_sb_info *sbi = UDF_SB(sb);
146 struct logicalVolIntegrityDesc *lvid; 146 struct logicalVolIntegrityDesc *lvid;
147 147
148 if (sbi->s_lvid_bh == NULL) 148 if (!sbi->s_lvid_bh)
149 return false; 149 return;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 return true; 153 udf_updated_lvid(sb);
154} 154}
155 155
156static void udf_bitmap_free_blocks(struct super_block *sb, 156static void udf_bitmap_free_blocks(struct super_block *sb,
157 struct inode *inode, 157 struct inode *inode,
158 struct udf_bitmap *bitmap, 158 struct udf_bitmap *bitmap,
159 kernel_lb_addr bloc, uint32_t offset, 159 struct kernel_lb_addr *bloc,
160 uint32_t offset,
160 uint32_t count) 161 uint32_t count)
161{ 162{
162 struct udf_sb_info *sbi = UDF_SB(sb); 163 struct udf_sb_info *sbi = UDF_SB(sb);
163 struct buffer_head *bh = NULL; 164 struct buffer_head *bh = NULL;
165 struct udf_part_map *partmap;
164 unsigned long block; 166 unsigned long block;
165 unsigned long block_group; 167 unsigned long block_group;
166 unsigned long bit; 168 unsigned long bit;
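After this refactor the helper owns all of the LVID bookkeeping:
callers no longer test a return value, dirty the LVID buffer and set
sb->s_dirt themselves; udf_updated_lvid() records the change instead.
The caller-side simplification, as seen in the later hunks of this file:

	/* Before */
	if (udf_add_free_space(sbi, partition, -alloc_count))
		mark_buffer_dirty(sbi->s_lvid_bh);
	sb->s_dirt = 1;

	/* After */
	udf_add_free_space(sb, partition, -alloc_count);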
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
169 unsigned long overflow; 171 unsigned long overflow;
170 172
171 mutex_lock(&sbi->s_alloc_mutex); 173 mutex_lock(&sbi->s_alloc_mutex);
172 if (bloc.logicalBlockNum < 0 || 174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
173 (bloc.logicalBlockNum + count) > 175 if (bloc->logicalBlockNum < 0 ||
174 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 176 (bloc->logicalBlockNum + count) >
177 partmap->s_partition_len) {
175 udf_debug("%d < %d || %d + %d > %d\n", 178 udf_debug("%d < %d || %d + %d > %d\n",
176 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
177 sbi->s_partmaps[bloc.partitionReferenceNum]. 180 count, partmap->s_partition_len);
178 s_partition_len);
179 goto error_return; 181 goto error_return;
180 } 182 }
181 183
182 block = bloc.logicalBlockNum + offset + 184 block = bloc->logicalBlockNum + offset +
183 (sizeof(struct spaceBitmapDesc) << 3); 185 (sizeof(struct spaceBitmapDesc) << 3);
184 186
185 do { 187 do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
207 } else { 209 } else {
208 if (inode) 210 if (inode)
209 vfs_dq_free_block(inode, 1); 211 vfs_dq_free_block(inode, 1);
210 udf_add_free_space(sbi, sbi->s_partition, 1); 212 udf_add_free_space(sb, sbi->s_partition, 1);
211 } 213 }
212 } 214 }
213 mark_buffer_dirty(bh); 215 mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
218 } while (overflow); 220 } while (overflow);
219 221
220error_return: 222error_return:
221 sb->s_dirt = 1;
222 if (sbi->s_lvid_bh)
223 mark_buffer_dirty(sbi->s_lvid_bh);
224 mutex_unlock(&sbi->s_alloc_mutex); 223 mutex_unlock(&sbi->s_alloc_mutex);
225} 224}
226 225
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
277 } while (block_count > 0); 276 } while (block_count > 0);
278 277
279out: 278out:
280 if (udf_add_free_space(sbi, partition, -alloc_count)) 279 udf_add_free_space(sb, partition, -alloc_count);
281 mark_buffer_dirty(sbi->s_lvid_bh);
282 sb->s_dirt = 1;
283 mutex_unlock(&sbi->s_alloc_mutex); 280 mutex_unlock(&sbi->s_alloc_mutex);
284 return alloc_count; 281 return alloc_count;
285} 282}
@@ -409,9 +406,7 @@ got_block:
409 406
410 mark_buffer_dirty(bh); 407 mark_buffer_dirty(bh);
411 408
412 if (udf_add_free_space(sbi, partition, -1)) 409 udf_add_free_space(sb, partition, -1);
413 mark_buffer_dirty(sbi->s_lvid_bh);
414 sb->s_dirt = 1;
415 mutex_unlock(&sbi->s_alloc_mutex); 410 mutex_unlock(&sbi->s_alloc_mutex);
416 *err = 0; 411 *err = 0;
417 return newblock; 412 return newblock;
@@ -425,26 +420,28 @@ error_return:
425static void udf_table_free_blocks(struct super_block *sb, 420static void udf_table_free_blocks(struct super_block *sb,
426 struct inode *inode, 421 struct inode *inode,
427 struct inode *table, 422 struct inode *table,
428 kernel_lb_addr bloc, uint32_t offset, 423 struct kernel_lb_addr *bloc,
424 uint32_t offset,
429 uint32_t count) 425 uint32_t count)
430{ 426{
431 struct udf_sb_info *sbi = UDF_SB(sb); 427 struct udf_sb_info *sbi = UDF_SB(sb);
428 struct udf_part_map *partmap;
432 uint32_t start, end; 429 uint32_t start, end;
433 uint32_t elen; 430 uint32_t elen;
434 kernel_lb_addr eloc; 431 struct kernel_lb_addr eloc;
435 struct extent_position oepos, epos; 432 struct extent_position oepos, epos;
436 int8_t etype; 433 int8_t etype;
437 int i; 434 int i;
438 struct udf_inode_info *iinfo; 435 struct udf_inode_info *iinfo;
439 436
440 mutex_lock(&sbi->s_alloc_mutex); 437 mutex_lock(&sbi->s_alloc_mutex);
441 if (bloc.logicalBlockNum < 0 || 438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
442 (bloc.logicalBlockNum + count) > 439 if (bloc->logicalBlockNum < 0 ||
443 sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) { 440 (bloc->logicalBlockNum + count) >
441 partmap->s_partition_len) {
444 udf_debug("%d < %d || %d + %d > %d\n", 442 udf_debug("%d < %d || %d + %d > %d\n",
445 bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, 443 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
446 sbi->s_partmaps[bloc.partitionReferenceNum]. 444 partmap->s_partition_len);
447 s_partition_len);
448 goto error_return; 445 goto error_return;
449 } 446 }
450 447
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
453 could occur, but.. oh well */ 450 could occur, but.. oh well */
454 if (inode) 451 if (inode)
455 vfs_dq_free_block(inode, count); 452 vfs_dq_free_block(inode, count);
456 if (udf_add_free_space(sbi, sbi->s_partition, count)) 453 udf_add_free_space(sb, sbi->s_partition, count);
457 mark_buffer_dirty(sbi->s_lvid_bh);
458 454
459 start = bloc.logicalBlockNum + offset; 455 start = bloc->logicalBlockNum + offset;
460 end = bloc.logicalBlockNum + offset + count - 1; 456 end = bloc->logicalBlockNum + offset + count - 1;
461 457
462 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry); 458 epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
463 elen = 0; 459 elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
483 start += count; 479 start += count;
484 count = 0; 480 count = 0;
485 } 481 }
486 udf_write_aext(table, &oepos, eloc, elen, 1); 482 udf_write_aext(table, &oepos, &eloc, elen, 1);
487 } else if (eloc.logicalBlockNum == (end + 1)) { 483 } else if (eloc.logicalBlockNum == (end + 1)) {
488 if ((0x3FFFFFFF - elen) < 484 if ((0x3FFFFFFF - elen) <
489 (count << sb->s_blocksize_bits)) { 485 (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
502 end -= count; 498 end -= count;
503 count = 0; 499 count = 0;
504 } 500 }
505 udf_write_aext(table, &oepos, eloc, elen, 1); 501 udf_write_aext(table, &oepos, &eloc, elen, 1);
506 } 502 }
507 503
508 if (epos.bh != oepos.bh) { 504 if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
532 */ 528 */
533 529
534 int adsize; 530 int adsize;
535 short_ad *sad = NULL; 531 struct short_ad *sad = NULL;
536 long_ad *lad = NULL; 532 struct long_ad *lad = NULL;
537 struct allocExtDesc *aed; 533 struct allocExtDesc *aed;
538 534
539 eloc.logicalBlockNum = start; 535 eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
541 (count << sb->s_blocksize_bits); 537 (count << sb->s_blocksize_bits);
542 538
543 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 539 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
544 adsize = sizeof(short_ad); 540 adsize = sizeof(struct short_ad);
545 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 541 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
546 adsize = sizeof(long_ad); 542 adsize = sizeof(struct long_ad);
547 else { 543 else {
548 brelse(oepos.bh); 544 brelse(oepos.bh);
549 brelse(epos.bh); 545 brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
563 elen -= sb->s_blocksize; 559 elen -= sb->s_blocksize;
564 560
565 epos.bh = udf_tread(sb, 561 epos.bh = udf_tread(sb,
566 udf_get_lb_pblock(sb, epos.block, 0)); 562 udf_get_lb_pblock(sb, &epos.block, 0));
567 if (!epos.bh) { 563 if (!epos.bh) {
568 brelse(oepos.bh); 564 brelse(oepos.bh);
569 goto error_return; 565 goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
601 if (sbi->s_udfrev >= 0x0200) 597 if (sbi->s_udfrev >= 0x0200)
602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 598 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
603 3, 1, epos.block.logicalBlockNum, 599 3, 1, epos.block.logicalBlockNum,
604 sizeof(tag)); 600 sizeof(struct tag));
605 else 601 else
606 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 602 udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
607 2, 1, epos.block.logicalBlockNum, 603 2, 1, epos.block.logicalBlockNum,
608 sizeof(tag)); 604 sizeof(struct tag));
609 605
610 switch (iinfo->i_alloc_type) { 606 switch (iinfo->i_alloc_type) {
611 case ICBTAG_FLAG_AD_SHORT: 607 case ICBTAG_FLAG_AD_SHORT:
612 sad = (short_ad *)sptr; 608 sad = (struct short_ad *)sptr;
613 sad->extLength = cpu_to_le32( 609 sad->extLength = cpu_to_le32(
614 EXT_NEXT_EXTENT_ALLOCDECS | 610 EXT_NEXT_EXTENT_ALLOCDECS |
615 sb->s_blocksize); 611 sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
617 cpu_to_le32(epos.block.logicalBlockNum); 613 cpu_to_le32(epos.block.logicalBlockNum);
618 break; 614 break;
619 case ICBTAG_FLAG_AD_LONG: 615 case ICBTAG_FLAG_AD_LONG:
620 lad = (long_ad *)sptr; 616 lad = (struct long_ad *)sptr;
621 lad->extLength = cpu_to_le32( 617 lad->extLength = cpu_to_le32(
622 EXT_NEXT_EXTENT_ALLOCDECS | 618 EXT_NEXT_EXTENT_ALLOCDECS |
623 sb->s_blocksize); 619 sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
635 631
636 /* It's possible that stealing the block emptied the extent */ 632 /* It's possible that stealing the block emptied the extent */
637 if (elen) { 633 if (elen) {
638 udf_write_aext(table, &epos, eloc, elen, 1); 634 udf_write_aext(table, &epos, &eloc, elen, 1);
639 635
640 if (!epos.bh) { 636 if (!epos.bh) {
641 iinfo->i_lenAlloc += adsize; 637 iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
653 brelse(oepos.bh); 649 brelse(oepos.bh);
654 650
655error_return: 651error_return:
656 sb->s_dirt = 1;
657 mutex_unlock(&sbi->s_alloc_mutex); 652 mutex_unlock(&sbi->s_alloc_mutex);
658 return; 653 return;
659} 654}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
666 struct udf_sb_info *sbi = UDF_SB(sb); 661 struct udf_sb_info *sbi = UDF_SB(sb);
667 int alloc_count = 0; 662 int alloc_count = 0;
668 uint32_t elen, adsize; 663 uint32_t elen, adsize;
669 kernel_lb_addr eloc; 664 struct kernel_lb_addr eloc;
670 struct extent_position epos; 665 struct extent_position epos;
671 int8_t etype = -1; 666 int8_t etype = -1;
672 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
677 672
678 iinfo = UDF_I(table); 673 iinfo = UDF_I(table);
679 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 674 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
680 adsize = sizeof(short_ad); 675 adsize = sizeof(struct short_ad);
681 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 676 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
682 adsize = sizeof(long_ad); 677 adsize = sizeof(struct long_ad);
683 else 678 else
684 return 0; 679 return 0;
685 680
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
707 alloc_count = block_count; 702 alloc_count = block_count;
708 eloc.logicalBlockNum += alloc_count; 703 eloc.logicalBlockNum += alloc_count;
709 elen -= (alloc_count << sb->s_blocksize_bits); 704 elen -= (alloc_count << sb->s_blocksize_bits);
710 udf_write_aext(table, &epos, eloc, 705 udf_write_aext(table, &epos, &eloc,
711 (etype << 30) | elen, 1); 706 (etype << 30) | elen, 1);
712 } else 707 } else
713 udf_delete_aext(table, epos, eloc, 708 udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
718 713
719 brelse(epos.bh); 714 brelse(epos.bh);
720 715
721 if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) { 716 if (alloc_count)
722 mark_buffer_dirty(sbi->s_lvid_bh); 717 udf_add_free_space(sb, partition, -alloc_count);
723 sb->s_dirt = 1;
724 }
725 mutex_unlock(&sbi->s_alloc_mutex); 718 mutex_unlock(&sbi->s_alloc_mutex);
726 return alloc_count; 719 return alloc_count;
727} 720}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
735 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF; 728 uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
736 uint32_t newblock = 0, adsize; 729 uint32_t newblock = 0, adsize;
737 uint32_t elen, goal_elen = 0; 730 uint32_t elen, goal_elen = 0;
738 kernel_lb_addr eloc, uninitialized_var(goal_eloc); 731 struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
739 struct extent_position epos, goal_epos; 732 struct extent_position epos, goal_epos;
740 int8_t etype; 733 int8_t etype;
741 struct udf_inode_info *iinfo = UDF_I(table); 734 struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
743 *err = -ENOSPC; 736 *err = -ENOSPC;
744 737
745 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 738 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
746 adsize = sizeof(short_ad); 739 adsize = sizeof(struct short_ad);
747 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 740 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
748 adsize = sizeof(long_ad); 741 adsize = sizeof(struct long_ad);
749 else 742 else
750 return newblock; 743 return newblock;
751 744
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
814 } 807 }
815 808
816 if (goal_elen) 809 if (goal_elen)
817 udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1); 810 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
818 else 811 else
819 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); 812 udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
820 brelse(goal_epos.bh); 813 brelse(goal_epos.bh);
821 814
822 if (udf_add_free_space(sbi, partition, -1)) 815 udf_add_free_space(sb, partition, -1);
823 mark_buffer_dirty(sbi->s_lvid_bh);
824 816
825 sb->s_dirt = 1;
826 mutex_unlock(&sbi->s_alloc_mutex); 817 mutex_unlock(&sbi->s_alloc_mutex);
827 *err = 0; 818 *err = 0;
828 return newblock; 819 return newblock;
829} 820}
830 821
831inline void udf_free_blocks(struct super_block *sb, 822void udf_free_blocks(struct super_block *sb, struct inode *inode,
832 struct inode *inode, 823 struct kernel_lb_addr *bloc, uint32_t offset,
833 kernel_lb_addr bloc, uint32_t offset, 824 uint32_t count)
834 uint32_t count)
835{ 825{
836 uint16_t partition = bloc.partitionReferenceNum; 826 uint16_t partition = bloc->partitionReferenceNum;
837 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 827 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
838 828
839 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 829 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
840 return udf_bitmap_free_blocks(sb, inode, 830 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
841 map->s_uspace.s_bitmap, 831 bloc, offset, count);
842 bloc, offset, count);
843 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 832 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
844 return udf_table_free_blocks(sb, inode, 833 udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
845 map->s_uspace.s_table, 834 bloc, offset, count);
846 bloc, offset, count);
847 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 835 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
848 return udf_bitmap_free_blocks(sb, inode, 836 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
849 map->s_fspace.s_bitmap, 837 bloc, offset, count);
850 bloc, offset, count);
851 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 838 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
852 return udf_table_free_blocks(sb, inode, 839 udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
853 map->s_fspace.s_table, 840 bloc, offset, count);
854 bloc, offset, count);
855 } else {
856 return;
857 } 841 }
858} 842}
859 843
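With bloc now passed by pointer and the void return made explicit, a
typical call site looks like the ialloc.c hunk later in this diff:

	/* Free the single block holding the inode's ICB. */
	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);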
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
53 struct buffer_head *tmp, *bha[16]; 53 struct buffer_head *tmp, *bha[16];
54 kernel_lb_addr eloc; 54 struct kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num, ret = 0; 57 int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
80 ret = -ENOENT; 80 ret = -ENOENT;
81 goto out; 81 goto out;
82 } 82 }
83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 epos.offset -= sizeof(short_ad); 86 epos.offset -= sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == 87 else if (iinfo->i_alloc_type ==
88 ICBTAG_FLAG_AD_LONG) 88 ICBTAG_FLAG_AD_LONG)
89 epos.offset -= sizeof(long_ad); 89 epos.offset -= sizeof(struct long_ad);
90 } else { 90 } else {
91 offset = 0; 91 offset = 0;
92 } 92 }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 101 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 102 i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
103 for (num = 0; i > 0; i--) { 103 for (num = 0; i > 0; i--) {
104 block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i); 104 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
105 tmp = udf_tgetblk(dir->i_sb, block); 105 tmp = udf_tgetblk(dir->i_sb, block);
106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 106 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
107 bha[num++] = tmp; 107 bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
161 memcpy(fname, "..", flen); 161 memcpy(fname, "..", flen);
162 dt_type = DT_DIR; 162 dt_type = DT_DIR;
163 } else { 163 } else {
164 kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); 164 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
165 165
166 iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0); 166 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 167 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
168 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
169 } 169 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
20 20
21#if 0 21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, 22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, kernel_lb_addr fe_loc, 23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh, 24 int *pos, int *offset, struct buffer_head **bh,
25 int *error) 25 int *error)
26{ 26{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 75 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 76 struct fileIdentDesc *cfi,
77 struct extent_position *epos, 77 struct extent_position *epos,
78 kernel_lb_addr *eloc, uint32_t *elen, 78 struct kernel_lb_addr *eloc, uint32_t *elen,
79 sector_t *offset) 79 sector_t *offset)
80{ 80{
81 struct fileIdentDesc *fi; 81 struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
111 (EXT_RECORDED_ALLOCATED >> 30)) 111 (EXT_RECORDED_ALLOCATED >> 30))
112 return NULL; 112 return NULL;
113 113
114 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 114 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
115 115
116 (*offset)++; 116 (*offset)++;
117 117
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
131 if (i + *offset > (*elen >> blocksize_bits)) 131 if (i + *offset > (*elen >> blocksize_bits))
132 i = (*elen >> blocksize_bits)-*offset; 132 i = (*elen >> blocksize_bits)-*offset;
133 for (num = 0; i > 0; i--) { 133 for (num = 0; i > 0; i--) {
134 block = udf_get_lb_pblock(dir->i_sb, *eloc, 134 block = udf_get_lb_pblock(dir->i_sb, eloc,
135 *offset + i); 135 *offset + i);
136 tmp = udf_tgetblk(dir->i_sb, block); 136 tmp = udf_tgetblk(dir->i_sb, block);
137 if (tmp && !buffer_uptodate(tmp) && 137 if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
169 (EXT_RECORDED_ALLOCATED >> 30)) 169 (EXT_RECORDED_ALLOCATED >> 30))
170 return NULL; 170 return NULL;
171 171
172 block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset); 172 block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
173 173
174 (*offset)++; 174 (*offset)++;
175 175
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
249} 249}
250 250
251#if 0 251#if 0
252static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) 252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{ 253{
254 extent_ad *ext; 254 struct extent_ad *ext;
255 struct fileEntry *fe; 255 struct fileEntry *fe;
256 uint8_t *ptr; 256 uint8_t *ptr;
257 257
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) 274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset; 275 ptr += *offset;
276 276
277 ext = (extent_ad *)ptr; 277 ext = (struct extent_ad *)ptr;
278 278
279 *offset = *offset + sizeof(extent_ad); 279 *offset = *offset + sizeof(struct extent_ad);
280 return ext; 280 return ext;
281} 281}
282#endif 282#endif
283 283
284short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 285 int inc)
286{ 286{
287 short_ad *sa; 287 struct short_ad *sa;
288 288
289 if ((!ptr) || (!offset)) { 289 if ((!ptr) || (!offset)) {
290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); 290 printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
291 return NULL; 291 return NULL;
292 } 292 }
293 293
294 if ((*offset + sizeof(short_ad)) > maxoffset) 294 if ((*offset + sizeof(struct short_ad)) > maxoffset)
295 return NULL; 295 return NULL;
296 else { 296 else {
297 sa = (short_ad *)ptr; 297 sa = (struct short_ad *)ptr;
298 if (sa->extLength == 0) 298 if (sa->extLength == 0)
299 return NULL; 299 return NULL;
300 } 300 }
301 301
302 if (inc) 302 if (inc)
303 *offset += sizeof(short_ad); 303 *offset += sizeof(struct short_ad);
304 return sa; 304 return sa;
305} 305}
306 306
307long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) 307struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
308{ 308{
309 long_ad *la; 309 struct long_ad *la;
310 310
311 if ((!ptr) || (!offset)) { 311 if ((!ptr) || (!offset)) {
312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); 312 printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
313 return NULL; 313 return NULL;
314 } 314 }
315 315
316 if ((*offset + sizeof(long_ad)) > maxoffset) 316 if ((*offset + sizeof(struct long_ad)) > maxoffset)
317 return NULL; 317 return NULL;
318 else { 318 else {
319 la = (long_ad *)ptr; 319 la = (struct long_ad *)ptr;
320 if (la->extLength == 0) 320 if (la->extLength == 0)
321 return NULL; 321 return NULL;
322 } 322 }
323 323
324 if (inc) 324 if (inc)
325 *offset += sizeof(long_ad); 325 *offset += sizeof(struct long_ad);
326 return la; 326 return la;
327} 327}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
38#define _ECMA_167_H 1 38#define _ECMA_167_H 1
39 39
40/* Character set specification (ECMA 167r3 1/7.2.1) */ 40/* Character set specification (ECMA 167r3 1/7.2.1) */
41typedef struct { 41struct charspec {
42 uint8_t charSetType; 42 uint8_t charSetType;
43 uint8_t charSetInfo[63]; 43 uint8_t charSetInfo[63];
44} __attribute__ ((packed)) charspec; 44} __attribute__ ((packed));
45 45
46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */ 46/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */ 47#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
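The rest of this header repeats the same mechanical conversion, in line
with kernel style's dislike of typedefs for plain structures;
schematically:

	/* Before: an anonymous struct hidden behind a typedef. */
	typedef struct {
		uint8_t charSetType;
		uint8_t charSetInfo[63];
	} __attribute__ ((packed)) charspec;

	/* After: a named struct, referenced as 'struct charspec' and
	 * measured as 'sizeof(struct charspec)'. */
	struct charspec {
		uint8_t charSetType;
		uint8_t charSetInfo[63];
	} __attribute__ ((packed));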
@@ -57,7 +57,7 @@ typedef struct {
57typedef uint8_t dstring; 57typedef uint8_t dstring;
58 58
59/* Timestamp (ECMA 167r3 1/7.3) */ 59/* Timestamp (ECMA 167r3 1/7.3) */
60typedef struct { 60struct timestamp {
61 __le16 typeAndTimezone; 61 __le16 typeAndTimezone;
62 __le16 year; 62 __le16 year;
63 uint8_t month; 63 uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
68 uint8_t centiseconds; 68 uint8_t centiseconds;
69 uint8_t hundredsOfMicroseconds; 69 uint8_t hundredsOfMicroseconds;
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed));
72 72
73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
74#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF 78#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
79 79
80/* Entity identifier (ECMA 167r3 1/7.4) */ 80/* Entity identifier (ECMA 167r3 1/7.4) */
81typedef struct { 81struct regid {
82 uint8_t flags; 82 uint8_t flags;
83 uint8_t ident[23]; 83 uint8_t ident[23];
84 uint8_t identSuffix[8]; 84 uint8_t identSuffix[8];
85} __attribute__ ((packed)) regid; 85} __attribute__ ((packed));
86 86
87/* Flags (ECMA 167r3 1/7.4.1) */ 87/* Flags (ECMA 167r3 1/7.4.1) */
88#define ENTITYID_FLAGS_DIRTY 0x00 88#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
126 126
127/* Boot Descriptor (ECMA 167r3 2/9.4) */ 127/* Boot Descriptor (ECMA 167r3 2/9.4) */
128struct bootDesc { 128struct bootDesc {
129 uint8_t structType; 129 uint8_t structType;
130 uint8_t stdIdent[VSD_STD_ID_LEN]; 130 uint8_t stdIdent[VSD_STD_ID_LEN];
131 uint8_t structVersion; 131 uint8_t structVersion;
132 uint8_t reserved1; 132 uint8_t reserved1;
133 regid archType; 133 struct regid archType;
134 regid bootIdent; 134 struct regid bootIdent;
135 __le32 bootExtLocation; 135 __le32 bootExtLocation;
136 __le32 bootExtLength; 136 __le32 bootExtLength;
137 __le64 loadAddress; 137 __le64 loadAddress;
138 __le64 startAddress; 138 __le64 startAddress;
139 timestamp descCreationDateAndTime; 139 struct timestamp descCreationDateAndTime;
140 __le16 flags; 140 __le16 flags;
141 uint8_t reserved2[32]; 141 uint8_t reserved2[32];
142 uint8_t bootUse[1906]; 142 uint8_t bootUse[1906];
143} __attribute__ ((packed)); 143} __attribute__ ((packed));
144 144
145/* Flags (ECMA 167r3 2/9.4.12) */ 145/* Flags (ECMA 167r3 2/9.4.12) */
146#define BOOT_FLAGS_ERASE 0x01 146#define BOOT_FLAGS_ERASE 0x01
147 147
148/* Extent Descriptor (ECMA 167r3 3/7.1) */ 148/* Extent Descriptor (ECMA 167r3 3/7.1) */
149typedef struct { 149struct extent_ad {
150 __le32 extLength; 150 __le32 extLength;
151 __le32 extLocation; 151 __le32 extLocation;
152} __attribute__ ((packed)) extent_ad; 152} __attribute__ ((packed));
153 153
154typedef struct { 154struct kernel_extent_ad {
155 uint32_t extLength; 155 uint32_t extLength;
156 uint32_t extLocation; 156 uint32_t extLocation;
157} kernel_extent_ad; 157};
158 158
159/* Descriptor Tag (ECMA 167r3 3/7.2) */ 159/* Descriptor Tag (ECMA 167r3 3/7.2) */
160typedef struct { 160struct tag {
161 __le16 tagIdent; 161 __le16 tagIdent;
162 __le16 descVersion; 162 __le16 descVersion;
163 uint8_t tagChecksum; 163 uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
166 __le16 descCRC; 166 __le16 descCRC;
167 __le16 descCRCLength; 167 __le16 descCRCLength;
168 __le32 tagLocation; 168 __le32 tagLocation;
169} __attribute__ ((packed)) tag; 169} __attribute__ ((packed));
170 170
171/* Tag Identifier (ECMA 167r3 3/7.2.1) */ 171/* Tag Identifier (ECMA 167r3 3/7.2.1) */
172#define TAG_IDENT_PVD 0x0001 172#define TAG_IDENT_PVD 0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
190 190
191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ 191/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
192struct primaryVolDesc { 192struct primaryVolDesc {
193 tag descTag; 193 struct tag descTag;
194 __le32 volDescSeqNum; 194 __le32 volDescSeqNum;
195 __le32 primaryVolDescNum; 195 __le32 primaryVolDescNum;
196 dstring volIdent[32]; 196 dstring volIdent[32];
197 __le16 volSeqNum; 197 __le16 volSeqNum;
198 __le16 maxVolSeqNum; 198 __le16 maxVolSeqNum;
199 __le16 interchangeLvl; 199 __le16 interchangeLvl;
200 __le16 maxInterchangeLvl; 200 __le16 maxInterchangeLvl;
201 __le32 charSetList; 201 __le32 charSetList;
202 __le32 maxCharSetList; 202 __le32 maxCharSetList;
203 dstring volSetIdent[128]; 203 dstring volSetIdent[128];
204 charspec descCharSet; 204 struct charspec descCharSet;
205 charspec explanatoryCharSet; 205 struct charspec explanatoryCharSet;
206 extent_ad volAbstract; 206 struct extent_ad volAbstract;
207 extent_ad volCopyright; 207 struct extent_ad volCopyright;
208 regid appIdent; 208 struct regid appIdent;
209 timestamp recordingDateAndTime; 209 struct timestamp recordingDateAndTime;
210 regid impIdent; 210 struct regid impIdent;
211 uint8_t impUse[64]; 211 uint8_t impUse[64];
212 __le32 predecessorVolDescSeqLocation; 212 __le32 predecessorVolDescSeqLocation;
213 __le16 flags; 213 __le16 flags;
214 uint8_t reserved[22]; 214 uint8_t reserved[22];
215} __attribute__ ((packed)); 215} __attribute__ ((packed));
216 216
217/* Flags (ECMA 167r3 3/10.1.21) */ 217/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
219 219
220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */ 220/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
221struct anchorVolDescPtr { 221struct anchorVolDescPtr {
222 tag descTag; 222 struct tag descTag;
223 extent_ad mainVolDescSeqExt; 223 struct extent_ad mainVolDescSeqExt;
224 extent_ad reserveVolDescSeqExt; 224 struct extent_ad reserveVolDescSeqExt;
225 uint8_t reserved[480]; 225 uint8_t reserved[480];
226} __attribute__ ((packed)); 226} __attribute__ ((packed));
227 227
228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */ 228/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
229struct volDescPtr { 229struct volDescPtr {
230 tag descTag; 230 struct tag descTag;
231 __le32 volDescSeqNum; 231 __le32 volDescSeqNum;
232 extent_ad nextVolDescSeqExt; 232 struct extent_ad nextVolDescSeqExt;
233 uint8_t reserved[484]; 233 uint8_t reserved[484];
234} __attribute__ ((packed)); 234} __attribute__ ((packed));
235 235
236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */ 236/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
237struct impUseVolDesc { 237struct impUseVolDesc {
238 tag descTag; 238 struct tag descTag;
239 __le32 volDescSeqNum; 239 __le32 volDescSeqNum;
240 regid impIdent; 240 struct regid impIdent;
241 uint8_t impUse[460]; 241 uint8_t impUse[460];
242} __attribute__ ((packed)); 242} __attribute__ ((packed));
243 243
244/* Partition Descriptor (ECMA 167r3 3/10.5) */ 244/* Partition Descriptor (ECMA 167r3 3/10.5) */
245struct partitionDesc { 245struct partitionDesc {
246 tag descTag; 246 struct tag descTag;
247 __le32 volDescSeqNum; 247 __le32 volDescSeqNum;
248 __le16 partitionFlags; 248 __le16 partitionFlags;
249 __le16 partitionNumber; 249 __le16 partitionNumber;
250 regid partitionContents; 250 struct regid partitionContents;
251 uint8_t partitionContentsUse[128]; 251 uint8_t partitionContentsUse[128];
252 __le32 accessType; 252 __le32 accessType;
253 __le32 partitionStartingLocation; 253 __le32 partitionStartingLocation;
254 __le32 partitionLength; 254 __le32 partitionLength;
255 regid impIdent; 255 struct regid impIdent;
256 uint8_t impUse[128]; 256 uint8_t impUse[128];
257 uint8_t reserved[156]; 257 uint8_t reserved[156];
258} __attribute__ ((packed)); 258} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
278 278
279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */ 279/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
280struct logicalVolDesc { 280struct logicalVolDesc {
281 tag descTag; 281 struct tag descTag;
282 __le32 volDescSeqNum; 282 __le32 volDescSeqNum;
283 charspec descCharSet; 283 struct charspec descCharSet;
284 dstring logicalVolIdent[128]; 284 dstring logicalVolIdent[128];
285 __le32 logicalBlockSize; 285 __le32 logicalBlockSize;
286 regid domainIdent; 286 struct regid domainIdent;
287 uint8_t logicalVolContentsUse[16]; 287 uint8_t logicalVolContentsUse[16];
288 __le32 mapTableLength; 288 __le32 mapTableLength;
289 __le32 numPartitionMaps; 289 __le32 numPartitionMaps;
290 regid impIdent; 290 struct regid impIdent;
291 uint8_t impUse[128]; 291 uint8_t impUse[128];
292 extent_ad integritySeqExt; 292 struct extent_ad integritySeqExt;
293 uint8_t partitionMaps[0]; 293 uint8_t partitionMaps[0];
294} __attribute__ ((packed)); 294} __attribute__ ((packed));
295 295
296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */ 296/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
322 322
323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */ 323/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
324struct unallocSpaceDesc { 324struct unallocSpaceDesc {
325 tag descTag; 325 struct tag descTag;
326 __le32 volDescSeqNum; 326 __le32 volDescSeqNum;
327 __le32 numAllocDescs; 327 __le32 numAllocDescs;
328 extent_ad allocDescs[0]; 328 struct extent_ad allocDescs[0];
329} __attribute__ ((packed)); 329} __attribute__ ((packed));
330 330
331/* Terminating Descriptor (ECMA 167r3 3/10.9) */ 331/* Terminating Descriptor (ECMA 167r3 3/10.9) */
332struct terminatingDesc { 332struct terminatingDesc {
333 tag descTag; 333 struct tag descTag;
334 uint8_t reserved[496]; 334 uint8_t reserved[496];
335} __attribute__ ((packed)); 335} __attribute__ ((packed));
336 336
337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */ 337/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
338struct logicalVolIntegrityDesc { 338struct logicalVolIntegrityDesc {
339 tag descTag; 339 struct tag descTag;
340 timestamp recordingDateAndTime; 340 struct timestamp recordingDateAndTime;
341 __le32 integrityType; 341 __le32 integrityType;
342 extent_ad nextIntegrityExt; 342 struct extent_ad nextIntegrityExt;
343 uint8_t logicalVolContentsUse[32]; 343 uint8_t logicalVolContentsUse[32];
344 __le32 numOfPartitions; 344 __le32 numOfPartitions;
345 __le32 lengthOfImpUse; 345 __le32 lengthOfImpUse;
346 __le32 freeSpaceTable[0]; 346 __le32 freeSpaceTable[0];
347 __le32 sizeTable[0]; 347 __le32 sizeTable[0];
348 uint8_t impUse[0]; 348 uint8_t impUse[0];
349} __attribute__ ((packed)); 349} __attribute__ ((packed));
350 350
351/* Integrity Type (ECMA 167r3 3/10.10.3) */ 351/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001 353#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001
354 354
355/* Recorded Address (ECMA 167r3 4/7.1) */ 355/* Recorded Address (ECMA 167r3 4/7.1) */
356typedef struct { 356struct lb_addr {
357 __le32 logicalBlockNum; 357 __le32 logicalBlockNum;
358 __le16 partitionReferenceNum; 358 __le16 partitionReferenceNum;
359} __attribute__ ((packed)) lb_addr; 359} __attribute__ ((packed));
360 360
361/* ... and its in-core analog */ 361/* ... and its in-core analog */
362typedef struct { 362struct kernel_lb_addr {
363 uint32_t logicalBlockNum; 363 uint32_t logicalBlockNum;
364 uint16_t partitionReferenceNum; 364 uint16_t partitionReferenceNum;
365} kernel_lb_addr; 365};
366 366
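The on-disk lb_addr keeps packed little-endian fields while its in-core
analog uses native types, so a conversion helper sits between them; a
sketch matching the lelb_to_cpu() call in the dir.c hunk earlier in
this diff (the body here is assumed, cf. fs/udf/udfend.h):

	static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
	{
		struct kernel_lb_addr out;

		out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
		out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
		return out;
	}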
367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ 367/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
368typedef struct { 368struct short_ad {
369 __le32 extLength; 369 __le32 extLength;
370 __le32 extPosition; 370 __le32 extPosition;
371} __attribute__ ((packed)) short_ad; 371} __attribute__ ((packed));
372 372
373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ 373/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
374typedef struct { 374struct long_ad {
375 __le32 extLength; 375 __le32 extLength;
376 lb_addr extLocation; 376 struct lb_addr extLocation;
377 uint8_t impUse[6]; 377 uint8_t impUse[6];
378} __attribute__ ((packed)) long_ad; 378} __attribute__ ((packed));
379 379
380typedef struct { 380struct kernel_long_ad {
381 uint32_t extLength; 381 uint32_t extLength;
382 kernel_lb_addr extLocation; 382 struct kernel_lb_addr extLocation;
383 uint8_t impUse[6]; 383 uint8_t impUse[6];
384} kernel_long_ad; 384};
385 385
386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */ 386/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
387typedef struct { 387struct ext_ad {
388 __le32 extLength; 388 __le32 extLength;
389 __le32 recordedLength; 389 __le32 recordedLength;
390 __le32 informationLength; 390 __le32 informationLength;
391 lb_addr extLocation; 391 struct lb_addr extLocation;
392} __attribute__ ((packed)) ext_ad; 392} __attribute__ ((packed));
393 393
394typedef struct { 394struct kernel_ext_ad {
395 uint32_t extLength; 395 uint32_t extLength;
396 uint32_t recordedLength; 396 uint32_t recordedLength;
397 uint32_t informationLength; 397 uint32_t informationLength;
398 kernel_lb_addr extLocation; 398 struct kernel_lb_addr extLocation;
399} kernel_ext_ad; 399};
400 400
401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */ 401/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
402 402
@@ -415,44 +415,44 @@ typedef struct {
415 415
416/* File Set Descriptor (ECMA 167r3 4/14.1) */ 416/* File Set Descriptor (ECMA 167r3 4/14.1) */
417struct fileSetDesc { 417struct fileSetDesc {
418 tag descTag; 418 struct tag descTag;
419 timestamp recordingDateAndTime; 419 struct timestamp recordingDateAndTime;
420 __le16 interchangeLvl; 420 __le16 interchangeLvl;
421 __le16 maxInterchangeLvl; 421 __le16 maxInterchangeLvl;
422 __le32 charSetList; 422 __le32 charSetList;
423 __le32 maxCharSetList; 423 __le32 maxCharSetList;
424 __le32 fileSetNum; 424 __le32 fileSetNum;
425 __le32 fileSetDescNum; 425 __le32 fileSetDescNum;
426 charspec logicalVolIdentCharSet; 426 struct charspec logicalVolIdentCharSet;
427 dstring logicalVolIdent[128]; 427 dstring logicalVolIdent[128];
428 charspec fileSetCharSet; 428 struct charspec fileSetCharSet;
429 dstring fileSetIdent[32]; 429 dstring fileSetIdent[32];
430 dstring copyrightFileIdent[32]; 430 dstring copyrightFileIdent[32];
431 dstring abstractFileIdent[32]; 431 dstring abstractFileIdent[32];
432 long_ad rootDirectoryICB; 432 struct long_ad rootDirectoryICB;
433 regid domainIdent; 433 struct regid domainIdent;
434 long_ad nextExt; 434 struct long_ad nextExt;
435 long_ad streamDirectoryICB; 435 struct long_ad streamDirectoryICB;
436 uint8_t reserved[32]; 436 uint8_t reserved[32];
437} __attribute__ ((packed)); 437} __attribute__ ((packed));
438 438
439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */ 439/* Partition Header Descriptor (ECMA 167r3 4/14.3) */
440struct partitionHeaderDesc { 440struct partitionHeaderDesc {
441 short_ad unallocSpaceTable; 441 struct short_ad unallocSpaceTable;
442 short_ad unallocSpaceBitmap; 442 struct short_ad unallocSpaceBitmap;
443 short_ad partitionIntegrityTable; 443 struct short_ad partitionIntegrityTable;
444 short_ad freedSpaceTable; 444 struct short_ad freedSpaceTable;
445 short_ad freedSpaceBitmap; 445 struct short_ad freedSpaceBitmap;
446 uint8_t reserved[88]; 446 uint8_t reserved[88];
447} __attribute__ ((packed)); 447} __attribute__ ((packed));
448 448
449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */ 449/* File Identifier Descriptor (ECMA 167r3 4/14.4) */
450struct fileIdentDesc { 450struct fileIdentDesc {
451 tag descTag; 451 struct tag descTag;
452 __le16 fileVersionNum; 452 __le16 fileVersionNum;
453 uint8_t fileCharacteristics; 453 uint8_t fileCharacteristics;
454 uint8_t lengthFileIdent; 454 uint8_t lengthFileIdent;
455 long_ad icb; 455 struct long_ad icb;
456 __le16 lengthOfImpUse; 456 __le16 lengthOfImpUse;
457 uint8_t impUse[0]; 457 uint8_t impUse[0];
458 uint8_t fileIdent[0]; 458 uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
468 468
469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */ 469/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
470struct allocExtDesc { 470struct allocExtDesc {
471 tag descTag; 471 struct tag descTag;
472 __le32 previousAllocExtLocation; 472 __le32 previousAllocExtLocation;
473 __le32 lengthAllocDescs; 473 __le32 lengthAllocDescs;
474} __attribute__ ((packed)); 474} __attribute__ ((packed));
475 475
476/* ICB Tag (ECMA 167r3 4/14.6) */ 476/* ICB Tag (ECMA 167r3 4/14.6) */
477typedef struct { 477struct icbtag {
478 __le32 priorRecordedNumDirectEntries; 478 __le32 priorRecordedNumDirectEntries;
479 __le16 strategyType; 479 __le16 strategyType;
480 __le16 strategyParameter; 480 __le16 strategyParameter;
481 __le16 numEntries; 481 __le16 numEntries;
482 uint8_t reserved; 482 uint8_t reserved;
483 uint8_t fileType; 483 uint8_t fileType;
484 lb_addr parentICBLocation; 484 struct lb_addr parentICBLocation;
485 __le16 flags; 485 __le16 flags;
486} __attribute__ ((packed)) icbtag; 486} __attribute__ ((packed));
487 487
488/* Strategy Type (ECMA 167r3 4/14.6.2) */ 488/* Strategy Type (ECMA 167r3 4/14.6.2) */
489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000 489#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000
@@ -528,41 +528,41 @@ typedef struct {
528 528
529/* Indirect Entry (ECMA 167r3 4/14.7) */ 529/* Indirect Entry (ECMA 167r3 4/14.7) */
530struct indirectEntry { 530struct indirectEntry {
531 tag descTag; 531 struct tag descTag;
532 icbtag icbTag; 532 struct icbtag icbTag;
533 long_ad indirectICB; 533 struct long_ad indirectICB;
534} __attribute__ ((packed)); 534} __attribute__ ((packed));
535 535
536/* Terminal Entry (ECMA 167r3 4/14.8) */ 536/* Terminal Entry (ECMA 167r3 4/14.8) */
537struct terminalEntry { 537struct terminalEntry {
538 tag descTag; 538 struct tag descTag;
539 icbtag icbTag; 539 struct icbtag icbTag;
540} __attribute__ ((packed)); 540} __attribute__ ((packed));
541 541
542/* File Entry (ECMA 167r3 4/14.9) */ 542/* File Entry (ECMA 167r3 4/14.9) */
543struct fileEntry { 543struct fileEntry {
544 tag descTag; 544 struct tag descTag;
545 icbtag icbTag; 545 struct icbtag icbTag;
546 __le32 uid; 546 __le32 uid;
547 __le32 gid; 547 __le32 gid;
548 __le32 permissions; 548 __le32 permissions;
549 __le16 fileLinkCount; 549 __le16 fileLinkCount;
550 uint8_t recordFormat; 550 uint8_t recordFormat;
551 uint8_t recordDisplayAttr; 551 uint8_t recordDisplayAttr;
552 __le32 recordLength; 552 __le32 recordLength;
553 __le64 informationLength; 553 __le64 informationLength;
554 __le64 logicalBlocksRecorded; 554 __le64 logicalBlocksRecorded;
555 timestamp accessTime; 555 struct timestamp accessTime;
556 timestamp modificationTime; 556 struct timestamp modificationTime;
557 timestamp attrTime; 557 struct timestamp attrTime;
558 __le32 checkpoint; 558 __le32 checkpoint;
559 long_ad extendedAttrICB; 559 struct long_ad extendedAttrICB;
560 regid impIdent; 560 struct regid impIdent;
561 __le64 uniqueID; 561 __le64 uniqueID;
562 __le32 lengthExtendedAttr; 562 __le32 lengthExtendedAttr;
563 __le32 lengthAllocDescs; 563 __le32 lengthAllocDescs;
564 uint8_t extendedAttr[0]; 564 uint8_t extendedAttr[0];
565 uint8_t allocDescs[0]; 565 uint8_t allocDescs[0];
566} __attribute__ ((packed)); 566} __attribute__ ((packed));
567 567
568/* Permissions (ECMA 167r3 4/14.9.5) */ 568/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
604 604
605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */ 605/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 impAttrLocation;
 	__le32 appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 impUseLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 appUseLength;
-	regid appIdent;
+	struct regid appIdent;
 	uint8_t appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 lengthAllocDescs;
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 numOfBits;
 	__le32 numOfBytes;
 	uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag descTag;
-	icbtag icbTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct timestamp recordingDateAndTime;
 	uint8_t integrityType;
 	uint8_t reserved[175];
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 objectSize;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp createTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp createTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
 	__le32 reserved;
-	long_ad extendedAttrICB;
-	long_ad streamDirectoryICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct long_ad streamDirectoryICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
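
The ecma_167.h hunks above are one mechanical transformation: the bare typedefs for the on-disk ECMA 167 records (tag, icbtag, regid, timestamp, short_ad, long_ad) become plainly tagged structs, so every use site now reads "struct tag", "struct regid", and so on. A minimal standalone sketch of the pattern -- illustrative only, not part of the patch; the 16-byte tag layout follows ECMA 167r3 3/7.2, and the kernel's __le16/__le32 types are shown as plain fixed-width integers so the sketch compiles on its own:

	#include <stdint.h>

	/* Before: an anonymous struct hidden behind a bare typedef. At a
	 * call site, "tag" reads like a scalar type and the struct cannot
	 * be forward-declared without pulling in the full definition. */
	typedef struct {
		uint16_t tagIdent;
		/* ... remaining fields ... */
	} tag;

	/* After: a named struct. Use sites spell out "struct tag", and
	 * other headers may forward-declare it. */
	struct tag {
		uint16_t tagIdent;	/* Tag Identifier */
		uint16_t descVersion;	/* Descriptor Version */
		uint8_t  tagChecksum;	/* checksum over the tag itself */
		uint8_t  reserved;
		uint16_t tagSerialNum;
		uint16_t descCRC;	/* CRC over the descriptor body */
		uint16_t descCRCLength;
		uint32_t tagLocation;	/* block holding this tag */
	} __attribute__ ((packed));
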
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f90..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
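
The ialloc.c hunks introduce the second pattern of this series: address helpers such as udf_get_lb_pblock() and udf_free_blocks() now take a struct kernel_lb_addr * rather than a by-value copy of the struct. A minimal sketch of the calling-convention change -- the helper names here are hypothetical, and the address-to-block mapping is a placeholder (the real udf_get_lb_pblock() resolves through the partition maps):

	#include <stdint.h>

	/* Logical block address within a UDF partition. */
	struct kernel_lb_addr {
		uint32_t logicalBlockNum;
		uint16_t partitionReferenceNum;
	};

	/* Old convention: the whole struct is copied for every call. */
	static uint32_t pblock_byval(struct kernel_lb_addr loc, uint32_t off)
	{
		return loc.logicalBlockNum + off;	/* placeholder mapping */
	}

	/* New convention: callers pass a pointer; no copy is made and one
	 * address can be handed down through a chain of helpers. */
	static uint32_t pblock_byref(const struct kernel_lb_addr *loc,
				     uint32_t off)
	{
		return loc->logicalBlockNum + off;	/* placeholder mapping */
	}

	int main(void)
	{
		struct kernel_lb_addr loc = { 64, 0 };

		/* Both forms compute the same value; only the argument
		 * passing differs, as in the hunks above and below. */
		return pblock_byval(loc, 1) == pblock_byref(&loc, 1) ? 0 : 1;
	}
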
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 			       last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
 				 last_ext->extLength, 1) == -1)
 			return -1;
 		count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, prealloc_loc,
+		if (udf_add_aext(inode, last_pos, &prealloc_loc,
 				 prealloc_len, 1) == -1)
 			return -1;
 		last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
 	/* last_pos should point to the last written extent... */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		last_pos->offset -= sizeof(short_ad);
+		last_pos->offset -= sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		last_pos->offset -= sizeof(long_ad);
+		last_pos->offset -= sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
 	static sector_t last_block;
 	struct buffer_head *result = NULL;
-	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+	struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
 	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
 	uint32_t elen = 0, tmpelen;
-	kernel_lb_addr eloc, tmpeloc;
+	struct kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
 	loff_t lbcount = 0, b_off = 0;
 	uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		elen = EXT_RECORDED_ALLOCATED |
 			((elen + inode->i_sb->s_blocksize - 1) &
 			 ~(inode->i_sb->s_blocksize - 1));
-		etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+		etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 	}
 	brelse(prev_epos.bh);
 	brelse(cur_epos.bh);
 	brelse(next_epos.bh);
-	newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+	newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	*phys = newblock;
 	return NULL;
 	}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	} else {
 		/* Create a fake extent when there's not one */
 		memset(&laarr[0].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 		/* Will udf_extend_file() create real extent from
 		   a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 		laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			inode->i_sb->s_blocksize;
 		memset(&laarr[c].extLocation, 0x00,
-		       sizeof(kernel_lb_addr));
+		       sizeof(struct kernel_lb_addr));
 		count++;
 		endnum++;
 	}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
 			      int newblocknum,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 	if (offset) {
 		if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					laarr[curr].extLocation,
+					&laarr[curr].extLocation,
 					0, offset);
 			laarr[curr].extLength =
 				EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 				inode->i_sb->s_blocksize_bits);
 		else {
 			memmove(&laarr[c + 2], &laarr[c + 1],
-				sizeof(long_ad) * (*endnum - (c + 1)));
+				sizeof(struct long_ad) * (*endnum - (c + 1)));
 			(*endnum)++;
 			laarr[c + 1].extLocation.logicalBlockNum = next;
 			laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 			if (*endnum > (i + 1))
 				memmove(&laarr[i],
 					&laarr[i + 1],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 1)));
 			i--;
 			(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-			      kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			      int *endnum)
 {
 	int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
 	for (i = 0; i < (*endnum - 1); i++) {
-		kernel_long_ad *li /*l[i]*/ = &laarr[i];
-		kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+		struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+		struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
 		if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
 		    (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
 		      (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
 		    ((lip1->extLength >> 30) ==
 		      (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-			udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+			udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
 					((li->extLength &
 					  UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
 				blocksize - 1) & ~(blocksize - 1));
 			if (*endnum > (i + 2))
 				memmove(&laarr[i + 1], &laarr[i + 2],
-					sizeof(long_ad) *
+					sizeof(struct long_ad) *
 					(*endnum - (i + 2)));
 			i--;
 			(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
 		} else if ((li->extLength >> 30) ==
 			   (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 			udf_free_blocks(inode->i_sb, inode,
-					li->extLocation, 0,
+					&li->extLocation, 0,
 					((li->extLength &
 					  UDF_EXTENT_LENGTH_MASK) +
 					 blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-			       kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
 	int start = 0, i;
-	kernel_lb_addr tmploc;
+	struct kernel_lb_addr tmploc;
 	uint32_t tmplen;
 
 	if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
 	for (i = start; i < endnum; i++) {
 		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-		udf_write_aext(inode, epos, laarr[i].extLocation,
+		udf_write_aext(inode, epos, &laarr[i].extLocation,
 			       laarr[i].extLength, 1);
 	}
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
 			inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 					&ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct buffer_head *nbh = NULL;
-			kernel_lb_addr loc;
+			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
 						    &ident))) {
 				if (ident == TAG_IDENT_FE ||
 					ident == TAG_IDENT_EFE) {
 					memcpy(&iinfo->i_location,
 						&loc,
-						sizeof(kernel_lb_addr));
+						sizeof(struct kernel_lb_addr));
 					brelse(bh);
 					brelse(ibh);
 					brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	inode->i_size = le64_to_cpu(fe->informationLength);
 	iinfo->i_lenExtents = inode->i_size;
 
-	inode->i_mode = udf_convert_permissions(fe);
-	inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+	if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+	    sbi->s_fmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_fmode;
+	else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+		 sbi->s_dmode != UDF_INVALID_MODE)
+		inode->i_mode = sbi->s_dmode;
+	else
+		inode->i_mode = udf_convert_permissions(fe);
+	inode->i_mode &= ~sbi->s_umask;
 
 	if (iinfo->i_efe == 0) {
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	bh = udf_tread(inode->i_sb,
 			udf_get_lb_pblock(inode->i_sb,
-					  iinfo->i_location, 0));
+					  &iinfo->i_location, 0));
 	if (!bh) {
 		udf_debug("bread failure\n");
 		return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(tag);
+				iinfo->i_lenAlloc - sizeof(struct tag);
 		use->descTag.tagLocation = cpu_to_le32(
						iinfo->i_location.
							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							sizeof(tag),
+							sizeof(struct tag),
 							crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->informationLength = cpu_to_le64(inode->i_size);
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		regid *eid;
+		struct regid *eid;
 		struct deviceSpec *dsea =
 			(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
 		if (!dsea) {
 			dsea = (struct deviceSpec *)
 				udf_add_extendedattr(inode,
 						     sizeof(struct deviceSpec) +
-						     sizeof(regid), 12, 0x3);
+						     sizeof(struct regid), 12, 0x3);
 			dsea->attrType = cpu_to_le32(12);
 			dsea->attrSubtype = 1;
 			dsea->attrLength = cpu_to_le32(
						sizeof(struct deviceSpec) +
-						sizeof(regid));
-			dsea->impUseLength = cpu_to_le32(sizeof(regid));
+						sizeof(struct regid));
+			dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
 		}
-		eid = (regid *)dsea->impUse;
-		memset(eid, 0, sizeof(regid));
+		eid = (struct regid *)dsea->impUse;
+		memset(eid, 0, sizeof(struct regid));
 		strcpy(eid->ident, UDF_ID_DEVELOPER);
 		eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
 		eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 	udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
 	udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-	memset(&(fe->impIdent), 0, sizeof(regid));
+	memset(&(fe->impIdent), 0, sizeof(struct regid));
 	strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 	fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 		udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-		memset(&(efe->impIdent), 0, sizeof(regid));
+		memset(&(efe->impIdent), 0, sizeof(struct regid));
 		strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
 		efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagLocation = cpu_to_le32(
					iinfo->i_location.logicalBlockNum);
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-		  sizeof(tag);
+		  sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						     crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 		return NULL;
 
 	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
 		__udf_read_inode(inode);
 		unlock_new_inode(inode);
 	}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	if (is_bad_inode(inode))
 		goto out_iput;
 
-	if (ino.logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+	if (ino->logicalBlockNum >= UDF_SB(sb)->
+			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
-			  ino.logicalBlockNum, ino.partitionReferenceNum);
+			  ino->logicalBlockNum, ino->partitionReferenceNum);
 		make_bad_inode(inode);
 		goto out_iput;
 	}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    kernel_lb_addr eloc, uint32_t elen, int inc)
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
-	short_ad *sad = NULL;
-	long_ad *lad = NULL;
+	struct short_ad *sad = NULL;
+	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
 	int8_t etype;
 	uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		ptr = epos->bh->b_data + epos->offset;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = epos->block;
+		struct kernel_lb_addr obloc = epos->block;
 
 		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 						obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		if (!epos->block.logicalBlockNum)
 			return -1;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 epos->block,
+								 &epos->block,
 								 0));
 		if (!nbh)
 			return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 	}
 	if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
 		udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-			    epos->block.logicalBlockNum, sizeof(tag));
+			    epos->block.logicalBlockNum, sizeof(struct tag));
 	else
 		udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-			    epos->block.logicalBlockNum, sizeof(tag));
+			    epos->block.logicalBlockNum, sizeof(struct tag));
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)sptr;
+		sad = (struct short_ad *)sptr;
 		sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 					     inode->i_sb->s_blocksize);
 		sad->extPosition =
 			cpu_to_le32(epos->block.logicalBlockNum);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)sptr;
+		lad = (struct long_ad *)sptr;
 		lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
 					     inode->i_sb->s_blocksize);
 		lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      kernel_lb_addr eloc, uint32_t elen, int inc)
+		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	switch (iinfo->i_alloc_type) {
 	case ICBTAG_FLAG_AD_SHORT:
-		sad = (short_ad *)ptr;
+		sad = (struct short_ad *)ptr;
 		sad->extLength = cpu_to_le32(elen);
-		sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-		adsize = sizeof(short_ad);
+		sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+		adsize = sizeof(struct short_ad);
 		break;
 	case ICBTAG_FLAG_AD_LONG:
-		lad = (long_ad *)ptr;
+		lad = (struct long_ad *)ptr;
 		lad->extLength = cpu_to_le32(elen);
-		lad->extLocation = cpu_to_lelb(eloc);
+		lad->extLocation = cpu_to_lelb(*eloc);
 		memset(lad->impUse, 0x00, sizeof(lad->impUse));
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 		break;
 	default:
 		return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-		     kernel_lb_addr *eloc, uint32_t *elen, int inc)
+		     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 		epos->block = *eloc;
 		epos->offset = sizeof(struct allocExtDesc);
 		brelse(epos->bh);
-		block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+		block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
 		epos->bh = udf_tread(inode->i_sb, block);
 		if (!epos->bh) {
 			udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-			kernel_lb_addr *eloc, uint32_t *elen, int inc)
+			struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
-	short_ad *sad;
-	long_ad *lad;
+	struct short_ad *sad;
+	struct long_ad *lad;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-			      kernel_lb_addr neloc, uint32_t nelen)
+			      struct kernel_lb_addr neloc, uint32_t nelen)
 {
-	kernel_lb_addr oeloc;
+	struct kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
 		get_bh(epos.bh);
 
 	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-		udf_write_aext(inode, &epos, neloc, nelen, 1);
+		udf_write_aext(inode, &epos, &neloc, nelen, 1);
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	udf_add_aext(inode, &epos, &neloc, nelen, 1);
 	brelse(epos.bh);
 
 	return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-		       kernel_lb_addr eloc, uint32_t elen)
+		       struct kernel_lb_addr eloc, uint32_t elen)
 {
 	struct extent_position oepos;
 	int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
 	iinfo = UDF_I(inode);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 		return -1;
 
 	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
 		if (oepos.bh != epos.bh) {
 			oepos.block = epos.block;
 			brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			oepos.offset = epos.offset - adsize;
 		}
 	}
-	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+	memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
 	elen = 0;
 
 	if (epos.bh != oepos.bh) {
-		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= (adsize * 2);
 			mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	} else {
-		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, &eloc, elen, 1);
 		if (!oepos.bh) {
 			iinfo->i_lenAlloc -= adsize;
 			mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-		  struct extent_position *pos, kernel_lb_addr *eloc,
+		  struct extent_position *pos, struct kernel_lb_addr *eloc,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
 	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
 			(EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+		ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
 	else
 		ret = 0;
 
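
One pattern repeats throughout the inode.c hunks above: select the on-disk allocation-descriptor size from the inode's allocation type, then step epos.offset back (or forward) by exactly one descriptor. A compact, self-contained sketch of that size selection -- descriptor layouts per ECMA 167r3 4/14.14.1-2; the flag values are illustrative stand-ins, not the kernel's definitions:

	#include <stdint.h>

	/* Short allocation descriptor: length + position, 8 bytes. */
	struct short_ad {
		uint32_t extLength;
		uint32_t extPosition;
	} __attribute__ ((packed));

	/* Long allocation descriptor: length + full logical block
	 * address + implementation use, 16 bytes. */
	struct long_ad {
		uint32_t extLength;
		struct {
			uint32_t logicalBlockNum;
			uint16_t partitionReferenceNum;
		} __attribute__ ((packed)) extLocation;
		uint8_t  impUse[6];
	} __attribute__ ((packed));

	enum { AD_SHORT, AD_LONG };	/* stand-ins for ICBTAG_FLAG_AD_* */

	/* Mirrors the adsize selection in udf_add_aext() and friends:
	 * a negative value signals an unsupported allocation type. */
	static int adsize_for(int alloc_type)
	{
		if (alloc_type == AD_SHORT)
			return sizeof(struct short_ad);	/* 8 */
		if (alloc_type == AD_LONG)
			return sizeof(struct long_ad);	/* 16 */
		return -1;
	}
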
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		}
 	}
 	/* rewrite CRC + checksum of eahd */
-	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 	eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 	eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-						sizeof(tag), crclen));
+						sizeof(struct tag), crclen));
 	eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 	iinfo->i_lenEAttr += size;
 	return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
 struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 				    uint32_t location, uint16_t *ident)
 {
-	tag *tag_p;
+	struct tag *tag_p;
 	struct buffer_head *bh = NULL;
 
 	/* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 		return NULL;
 	}
 
-	tag_p = (tag *)(bh->b_data);
+	tag_p = (struct tag *)(bh->b_data);
 
 	*ident = le16_to_cpu(tag_p->tagIdent);
 
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 	}
 
 	/* Verify the descriptor CRC */
-	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
 	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
-					bh->b_data + sizeof(tag),
+					bh->b_data + sizeof(struct tag),
 					le16_to_cpu(tag_p->descCRCLength)))
 		return bh;
 
@@ -255,27 +255,28 @@ error_out:
 	return NULL;
 }
 
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+				     struct kernel_lb_addr *loc,
 				     uint32_t offset, uint16_t *ident)
 {
 	return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
-			       loc.logicalBlockNum + offset, ident);
+			       loc->logicalBlockNum + offset, ident);
 }
 
 void udf_update_tag(char *data, int length)
 {
-	tag *tptr = (tag *)data;
-	length -= sizeof(tag);
+	struct tag *tptr = (struct tag *)data;
+	length -= sizeof(struct tag);
 
 	tptr->descCRCLength = cpu_to_le16(length);
-	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
 	tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
 void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 		 uint32_t loc, int length)
 {
-	tag *tptr = (tag *)data;
+	struct tag *tptr = (struct tag *)data;
 	tptr->tagIdent = cpu_to_le16(ident);
 	tptr->descVersion = cpu_to_le16(version);
 	tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 	udf_update_tag(data, length);
 }
 
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
 {
 	u8 *data = (u8 *)t;
 	u8 checksum = 0;
 	int i;
-	for (i = 0; i < sizeof(tag); ++i)
+	for (i = 0; i < sizeof(struct tag); ++i)
 		if (i != 4) /* position of checksum */
 			checksum += data[i];
 	return checksum;
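
udf_tag_checksum() above shows how a descriptor tag protects itself: the one-byte tagChecksum is the modulo-256 sum of the other 15 tag bytes (byte 4, the checksum itself, is skipped), while descCRC covers the descriptor body that follows the 16-byte tag. A standalone restatement of the checksum rule -- the tag layout is the same sketch given earlier for ecma_167.h, and the loop mirrors the kernel code above:

	#include <stdint.h>

	struct tag {
		uint16_t tagIdent;
		uint16_t descVersion;
		uint8_t  tagChecksum;	/* byte 4: excluded from its own sum */
		uint8_t  reserved;
		uint16_t tagSerialNum;
		uint16_t descCRC;
		uint16_t descCRCLength;
		uint32_t tagLocation;
	} __attribute__ ((packed));

	static uint8_t tag_checksum(const struct tag *t)
	{
		const uint8_t *data = (const uint8_t *)t;
		uint8_t checksum = 0;	/* uint8_t addition wraps mod 256 */
		int i;

		for (i = 0; i < (int)sizeof(struct tag); ++i)
			if (i != 4)	/* position of the checksum */
				checksum += data[i];
		return checksum;
	}
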
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
 		 uint8_t *impuse, uint8_t *fileident)
 {
-	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
 	uint16_t crc;
 	int offset;
 	uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		memset(fibh->ebh->b_data, 0x00, padlen + offset);
 	}
 
-	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
-			sizeof(struct fileIdentDesc) - sizeof(tag));
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+			sizeof(struct fileIdentDesc) - sizeof(struct tag));
 
 	if (fibh->sbh == fibh->ebh) {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
 		crc = crc_itu_t(crc, fibh->ebh->b_data +
					sizeof(struct fileIdentDesc) +
					fibh->soffset,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
 			goto out_err;
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #ifdef UDF_RECOVERY
 	/* temporary shorthand for specifying files by inode number */
 	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-		kernel_lb_addr lb = {
+		struct kernel_lb_addr lb = {
 			.logicalBlockNum = 0,
 			.partitionReferenceNum =
 				simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #endif /* UDF_RECOVERY */
 
 	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+		struct kernel_lb_addr loc;
+
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
 
-		inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+		loc = lelb_to_cpu(cfi.icb.extLocation);
+		inode = udf_iget(dir->i_sb, &loc);
 		if (!inode) {
 			unlock_kernel();
 			return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen = 0;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
 			block = udf_get_lb_pblock(dir->i_sb,
-						  dinfo->i_location, 0);
+						  &dinfo->i_location, 0);
 			fibh->soffset = fibh->eoffset = sb->s_blocksize;
 			goto add;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -409,10 +412,10 @@ add:
 	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
 		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			epos.offset -= sizeof(short_ad);
+			epos.offset -= sizeof(struct short_ad);
 		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			epos.offset -= sizeof(long_ad);
-		udf_write_aext(dir, &epos, eloc, elen, 1);
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
 	}
 	f_pos += nfidlen;
 
@@ -494,10 +497,10 @@ add:
 	memset(cfi, 0, sizeof(struct fileIdentDesc));
 	if (UDF_SB(sb)->s_udfrev >= 0x0200)
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	else
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	cfi->fileVersionNum = cpu_to_le16(1);
 	cfi->lengthFileIdent = namelen;
 	cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-		memset(&(cfi->icb), 0x00, sizeof(long_ad));
+		memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
 
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
 	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
 			    &epos, &eloc, &elen, &offset) ==
 			(EXT_RECORDED_ALLOCATED >> 30)) {
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi, cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_rmdir;
 	retval = -ENOTEMPTY;
 	if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &page_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
-		kernel_lb_addr eloc;
+		struct kernel_lb_addr eloc;
 		uint32_t bsize;
 
 		block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 			iinfo->i_location.partitionReferenceNum;
914 bsize = inode->i_sb->s_blocksize; 917 bsize = inode->i_sb->s_blocksize;
915 iinfo->i_lenExtents = bsize; 918 iinfo->i_lenExtents = bsize;
916 udf_add_aext(inode, &epos, eloc, bsize, 0); 919 udf_add_aext(inode, &epos, &eloc, bsize, 0);
917 brelse(epos.bh); 920 brelse(epos.bh);
918 921
919 block = udf_get_pblock(inode->i_sb, block, 922 block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1108 struct fileIdentDesc ocfi, ncfi; 1111 struct fileIdentDesc ocfi, ncfi;
1109 struct buffer_head *dir_bh = NULL; 1112 struct buffer_head *dir_bh = NULL;
1110 int retval = -ENOENT; 1113 int retval = -ENOENT;
1111 kernel_lb_addr tloc; 1114 struct kernel_lb_addr tloc;
1112 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1115 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1113 1116
1114 lock_kernel(); 1117 lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1119 brelse(ofibh.sbh); 1122 brelse(ofibh.sbh);
1120 } 1123 }
1121 tloc = lelb_to_cpu(ocfi.icb.extLocation); 1124 tloc = lelb_to_cpu(ocfi.icb.extLocation);
1122 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0) 1125 if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
1123 != old_inode->i_ino) 1126 != old_inode->i_ino)
1124 goto end_rename; 1127 goto end_rename;
1125 1128
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1158 if (!dir_fi) 1161 if (!dir_fi)
1159 goto end_rename; 1162 goto end_rename;
1160 tloc = lelb_to_cpu(dir_fi->icb.extLocation); 1163 tloc = lelb_to_cpu(dir_fi->icb.extLocation);
1161 if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) != 1164 if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
1162 old_dir->i_ino) 1165 old_dir->i_ino)
1163 goto end_rename; 1166 goto end_rename;
1164 1167
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1187 */ 1190 */
1188 ncfi.fileVersionNum = ocfi.fileVersionNum; 1191 ncfi.fileVersionNum = ocfi.fileVersionNum;
1189 ncfi.fileCharacteristics = ocfi.fileCharacteristics; 1192 ncfi.fileCharacteristics = ocfi.fileCharacteristics;
1190 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad)); 1193 memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
1191 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); 1194 udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
1192 1195
1193 /* The old fid may have moved - find it again */ 1196 /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
1242 1245
1243static struct dentry *udf_get_parent(struct dentry *child) 1246static struct dentry *udf_get_parent(struct dentry *child)
1244{ 1247{
1248 struct kernel_lb_addr tloc;
1245 struct inode *inode = NULL; 1249 struct inode *inode = NULL;
1246 struct qstr dotdot = {.name = "..", .len = 2}; 1250 struct qstr dotdot = {.name = "..", .len = 2};
1247 struct fileIdentDesc cfi; 1251 struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
1255 brelse(fibh.ebh); 1259 brelse(fibh.ebh);
1256 brelse(fibh.sbh); 1260 brelse(fibh.sbh);
1257 1261
1258 inode = udf_iget(child->d_inode->i_sb, 1262 tloc = lelb_to_cpu(cfi.icb.extLocation);
1259 lelb_to_cpu(cfi.icb.extLocation)); 1263 inode = udf_iget(child->d_inode->i_sb, &tloc);
1260 if (!inode) 1264 if (!inode)
1261 goto out_unlock; 1265 goto out_unlock;
1262 unlock_kernel(); 1266 unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
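
A note on the pattern in the hunks above (udf_rmdir, udf_unlink, udf_rename, udf_get_parent and friends): every kernel_lb_addr argument moves from pass-by-value to pass-by-pointer, and the bare typedefs (kernel_lb_addr, long_ad, tag) become explicit struct tags. Below is a minimal userspace sketch of the calling-convention change; the type layout and function names are stand-ins for illustration, not the kernel's real definitions.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the on-disk logical block address; illustrative only. */
struct kernel_lb_addr {
	uint32_t logicalBlockNum;
	uint16_t partitionReferenceNum;
};

/* Old style: the whole struct is copied on every call. */
static uint32_t get_pblock_byval(struct kernel_lb_addr loc, uint32_t offset)
{
	return loc.logicalBlockNum + offset;
}

/* New style: callers pass a pointer, so nothing is copied and the
 * signature makes the aggregate nature of the argument explicit. */
static uint32_t get_pblock_byref(const struct kernel_lb_addr *loc,
				 uint32_t offset)
{
	return loc->logicalBlockNum + offset;
}

int main(void)
{
	struct kernel_lb_addr tloc = { .logicalBlockNum = 42,
				       .partitionReferenceNum = 0 };

	printf("%u\n", (unsigned)get_pblock_byval(tloc, 1));  /* old call style */
	printf("%u\n", (unsigned)get_pblock_byref(&tloc, 1)); /* new call style */
	return 0;
}
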
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
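
The osta_udf.h hunks are mechanical: every typedef'd on-disk type (regid, charspec, tag) is now referenced through its struct tag, matching the kernel's general preference for struct tags over typedefs. A hedged sketch of the pattern follows; the descriptor is invented for illustration and its field layout should not be read as the real OSTA definition.

#include <stdint.h>
#include <stdio.h>

/* Invented on-disk identifier, loosely modelled on a regid. */
struct regid {
	uint8_t flags;
	uint8_t ident[23];
	uint8_t identSuffix[8];
} __attribute__ ((packed));

/* Before: "regid partIdent;" leaned on "typedef struct { ... } regid;".
 * After: "struct regid partIdent;" makes it obvious at the use site that
 * this member is an aggregate with an on-disk layout. */
struct example_partition_map {
	uint8_t partitionMapType;
	uint8_t partitionMapLength;
	uint8_t reserved1[2];
	struct regid partIdent;		/* was: regid partIdent; */
	uint16_t volSeqNum;
	uint16_t partitionNum;
} __attribute__ ((packed));

int main(void)
{
	/* 40 bytes when packed: 1 + 1 + 2 + 32 + 2 + 2. */
	printf("%zu\n", sizeof(struct example_partition_map));
	return 0;
}
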
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
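
The super.c diff below retires ->write_super in favour of ->sync_fs plus an s_lvid_dirty flag, so a sync only re-marks the LVID buffer dirty under s_alloc_mutex and leaves the actual write-out to the block device sync. Here is a userspace analogue of that deferred-flush pattern; the names are invented and a pthread mutex stands in for the superblock mutex.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_sb {
	pthread_mutex_t alloc_mutex;	/* stands in for sbi->s_alloc_mutex */
	bool lvid_dirty;	/* set whenever the in-memory LVID changes */
};

/* Analogue of udf_sync_fs(): clear the dirty flag under the lock and
 * let whoever syncs the backing store perform the real write. */
static int fake_sync_fs(struct fake_sb *sb)
{
	pthread_mutex_lock(&sb->alloc_mutex);
	if (sb->lvid_dirty) {
		printf("re-marking LVID buffer dirty for later write-out\n");
		sb->lvid_dirty = false;
	}
	pthread_mutex_unlock(&sb->alloc_mutex);
	return 0;
}

int main(void)
{
	struct fake_sb sb = { PTHREAD_MUTEX_INITIALIZER, true };
	return fake_sync_fs(&sb);
}
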
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this, so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_lastblock to be the last
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_last_block to be the last
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try without converting the number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with the converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sbi->s_uid = uopt.uid;
 	sbi->s_gid = uopt.gid;
 	sbi->s_umask = uopt.umask;
+	sbi->s_fmode = uopt.fmode;
+	sbi->s_dmode = uopt.dmode;
 	sbi->s_nls_map = uopt.nls_map;
 
-	/* Set the block size for all transfers */
-	if (!sb_min_blocksize(sb, uopt.blocksize)) {
-		udf_debug("Bad block size (%d)\n", uopt.blocksize);
-		printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
-		goto error_out;
-	}
-
 	if (uopt.session == 0xFFFFFFFF)
 		sbi->s_session = udf_get_last_session(sb);
 	else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 	udf_debug("Multi-session=%d\n", sbi->s_session);
 
-	sbi->s_last_block = uopt.lastblock;
-	sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
-	sbi->s_anchor[2] = uopt.anchor;
-
-	if (udf_check_valid(sb, uopt.novrs, silent)) {
-		/* read volume recognition sequences */
-		printk(KERN_WARNING "UDF-fs: No VRS found\n");
-		goto error_out;
-	}
-
-	udf_find_anchor(sb);
-
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
 
-	if (udf_load_sequence(sb, &fileset)) {
+	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+	} else {
+		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+			if (!silent)
+				printk(KERN_NOTICE
+				       "UDF-fs: Rescanning with blocksize "
+				       "%d\n", UDF_DEFAULT_BLOCKSIZE);
+			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		}
+	}
+	if (!ret) {
 		printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
 		goto error_out;
 	}
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	}
 
 	if (!silent) {
-		timestamp ts;
+		struct timestamp ts;
 		udf_time_to_disk_stamp(&ts, sbi->s_record_time);
 		udf_info("UDF: Mounting volume '%s', "
 			 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Assign the root inode */
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
-	inode = udf_iget(sb, rootdir);
+	inode = udf_iget(sb, &rootdir);
 	if (!inode) {
 		printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
 		       "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	mutex_lock(&sbi->s_alloc_mutex);
+	if (sbi->s_lvid_dirty) {
+		/*
+		 * Blockdevice will be synced later so we don't have to submit
+		 * the buffer for IO
+		 */
+		mark_buffer_dirty(sbi->s_lvid_bh);
+		sb->s_dirt = 0;
+		sbi->s_lvid_dirty = 0;
+	}
+	mutex_unlock(&sbi->s_alloc_mutex);
+
+	return 0;
+}
+
 static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	if (sbi->s_lvid_bh != NULL)
 		lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 					  le32_to_cpu(lvidiu->numDirs)) : 0)
 			+ buf->f_bfree;
 	buf->f_ffree = buf->f_bfree;
-	/* __kernel_fsid_t f_fsid */
 	buf->f_namelen = UDF_NAME_LEN - 2;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 	unsigned int accum = 0;
 	int index;
 	int block = 0, newblock;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 	uint32_t bytes;
 	uint8_t *ptr;
 	uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 
 	loc.logicalBlockNum = bitmap->s_extPosition;
 	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
-	bh = udf_read_ptagged(sb, loc, 0, &ident);
+	bh = udf_read_ptagged(sb, &loc, 0, &ident);
 
 	if (!bh) {
 		printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 		bytes -= cur_bytes;
 		if (bytes) {
 			brelse(bh);
-			newblock = udf_get_lb_pblock(sb, loc, ++block);
+			newblock = udf_get_lb_pblock(sb, &loc, ++block);
 			bh = udf_tread(sb, newblock);
 			if (!bh) {
 				udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
 {
 	unsigned int accum = 0;
 	uint32_t elen;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	int8_t etype;
 	struct extent_position epos;
 
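The udf_fill_super() rework above replaces the fixed-blocksize probe with udf_load_vrs(): the user-requested blocksize is used when UDF_FLAG_BLOCKSIZE_SET is given, otherwise the device hard-sector size is tried with a fallback rescan at UDF_DEFAULT_BLOCKSIZE. The new udf_sync_fs() pairs with the s_lvid_dirty field added to udf_sb.h further down: updaters merely flag the cached LVID buffer, and sync writes it out once. A minimal sketch of that deferred-write pattern, with hypothetical names (my_sb_info, my_update, my_sync_fs), not the patch's own:

	/* Sketch only; assumes kernel context (linux/fs.h, linux/buffer_head.h). */
	struct my_sb_info {
		struct mutex		lock;	/* protects dirty and bh */
		struct buffer_head	*bh;	/* cached on-disk record */
		unsigned int		dirty;	/* record modified in memory */
	};

	static void my_update(struct super_block *sb, struct my_sb_info *si)
	{
		/* modify si->bh->b_data under si->lock, then: */
		sb->s_dirt = 1;		/* have the VFS call sync_fs later */
		si->dirty = 1;		/* remember which buffer needs writing */
	}

	static int my_sync_fs(struct super_block *sb, int wait)
	{
		struct my_sb_info *si = sb->s_fs_info;

		mutex_lock(&si->lock);
		if (si->dirty) {
			/* the blockdev sync that follows will submit the I/O */
			mark_buffer_dirty(si->bh);
			sb->s_dirt = 0;
			si->dirty = 0;
		}
		mutex_unlock(&si->lock);
		return 0;
	}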
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
 #include "udf_sb.h"
 
 static void extent_trunc(struct inode *inode, struct extent_position *epos,
-			 kernel_lb_addr eloc, int8_t etype, uint32_t elen,
+			 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
 			 uint32_t nelen)
 {
-	kernel_lb_addr neloc = {};
+	struct kernel_lb_addr neloc = {};
 	int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 				       last_block);
 			etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
 		} else
-			neloc = eloc;
+			neloc = *eloc;
 		nelen = (etype << 30) | nelen;
 	}
 
 	if (elen != nelen) {
-		udf_write_aext(inode, epos, neloc, nelen, 0);
+		udf_write_aext(inode, epos, &neloc, nelen, 0);
 		if (last_block - first_block > 0) {
 			if (etype == (EXT_RECORDED_ALLOCATED >> 30))
 				mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 void udf_truncate_tail_extent(struct inode *inode)
 {
 	struct extent_position epos = {};
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen, nelen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 				       (unsigned)elen);
 			nelen = elen - (lbcount - inode->i_size);
 			epos.offset -= adsize;
-			extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+			extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
 			epos.offset += adsize;
 			if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
 				printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 void udf_discard_prealloc(struct inode *inode)
 {
 	struct extent_position epos = { NULL, 0, {0, 0} };
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
 		return;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		adsize = 0;
 
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
 	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
 		epos.offset -= adsize;
 		lbcount -= elen;
-		extent_trunc(inode, &epos, eloc, etype, elen, 0);
+		extent_trunc(inode, &epos, &eloc, etype, elen, 0);
 		if (!epos.bh) {
 			iinfo->i_lenAlloc =
 				epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
 void udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
-	kernel_lb_addr eloc, neloc = {};
+	struct kernel_lb_addr eloc, neloc = {};
 	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
 	int8_t etype;
 	struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		BUG();
 
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
 			(inode->i_size & (sb->s_blocksize - 1));
 	if (etype != -1) {
 		epos.offset -= adsize;
-		extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+		extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
 		epos.offset += adsize;
 		if (byte_offset)
 			lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
 	while ((etype = udf_current_aext(inode, &epos, &eloc,
 					 &elen, 0)) != -1) {
 		if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
-			udf_write_aext(inode, &epos, neloc, nelen, 0);
+			udf_write_aext(inode, &epos, &neloc, nelen, 0);
 			if (indirect_ext_len) {
 				/* We managed to free all extents in the
 				 * indirect extent - free it too */
 				BUG_ON(!epos.bh);
-				udf_free_blocks(sb, inode, epos.block,
+				udf_free_blocks(sb, inode, &epos.block,
 						0, indirect_ext_len);
 			} else if (!epos.bh) {
 				iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
 			epos.offset = sizeof(struct allocExtDesc);
 			epos.block = eloc;
 			epos.bh = udf_tread(sb,
-					udf_get_lb_pblock(sb, eloc, 0));
+					udf_get_lb_pblock(sb, &eloc, 0));
 			if (elen)
 				indirect_ext_len =
 					(elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
 			else
 				indirect_ext_len = 1;
 		} else {
-			extent_trunc(inode, &epos, eloc, etype,
+			extent_trunc(inode, &epos, &eloc, etype,
 				     elen, 0);
 			epos.offset += adsize;
 		}
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
 
 	if (indirect_ext_len) {
 		BUG_ON(!epos.bh);
-		udf_free_blocks(sb, inode, epos.block, 0,
+		udf_free_blocks(sb, inode, &epos.block, 0,
 				indirect_ext_len);
 	} else if (!epos.bh) {
 		iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
 		udf_update_alloc_ext_desc(inode, &epos, lenalloc);
 	} else if (inode->i_size) {
 		if (byte_offset) {
-			kernel_long_ad extent;
+			struct kernel_long_ad extent;
 
 			/*
 			 * OK, there is not extent covering inode->i_size and
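Every truncate.c hunk is the same mechanical conversion: the kernel_lb_addr typedef becomes struct kernel_lb_addr, and extent addresses are passed by pointer instead of by value, so call sites take an address and value uses dereference. In miniature (the "lb" type and trunc() are illustrative stand-ins, not the real definitions):

	struct lb { uint32_t block; uint16_t part; };	/* stand-in type */

	/* old: void trunc(struct lb eloc);   copies the struct on every call
	 * new: void trunc(struct lb *eloc);  passes only a pointer          */
	static void trunc(const struct lb *eloc)
	{
		struct lb neloc = {};

		neloc = *eloc;	/* value use now dereferences */
		(void)neloc;	/* sketch only; real code writes it out */
	}

	/* caller side changes from trunc(eloc) to trunc(&eloc) */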
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
 struct udf_inode_info {
 	struct timespec		i_crtime;
 	/* Physical address of inode */
-	kernel_lb_addr		i_location;
+	struct kernel_lb_addr	i_location;
 	__u64			i_unique;
 	__u32			i_lenEAttr;
 	__u32			i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
 	unsigned		i_strat4096 : 1;
 	unsigned		reserved : 26;
 	union {
-		short_ad	*i_sad;
-		long_ad		*i_lad;
+		struct short_ad	*i_sad;
+		struct long_ad	*i_lad;
 		__u8		*i_data;
 	} i_ext;
 	struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
 #define UDF_FLAG_GID_SET	16
 #define UDF_FLAG_SESSION_SET	17
 #define UDF_FLAG_LASTBLOCK_SET	18
+#define UDF_FLAG_BLOCKSIZE_SET	19
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
@@ -48,6 +49,8 @@
 #define UDF_SPARABLE_MAP15		0x1522U
 #define UDF_METADATA_MAP25		0x2511U
 
+#define UDF_INVALID_MODE		((mode_t)-1)
+
 #pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
 
 struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
 
 	/* Sector headers */
 	__s32			s_session;
-	__u32			s_anchor[3];
+	__u32			s_anchor;
 	__u32			s_last_block;
 
 	struct buffer_head	*s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
 	mode_t			s_umask;
 	gid_t			s_gid;
 	uid_t			s_uid;
+	mode_t			s_fmode;
+	mode_t			s_dmode;
 
 	/* Root Info */
 	struct timespec		s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
 	struct inode		*s_vat_inode;
 
 	struct mutex		s_alloc_mutex;
+	/* Protected by s_alloc_mutex */
+	unsigned int		s_lvid_dirty;
 };
 
 static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
 	return 0;
 }
 
-#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
-
 /* computes tag checksum */
-u8 udf_tag_checksum(const tag *t);
+u8 udf_tag_checksum(const struct tag *t);
 
 struct dentry;
 struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
 };
 
 struct generic_desc {
-	tag		descTag;
+	struct tag	descTag;
 	__le32		volDescSeqNum;
 };
 
@@ -108,11 +106,22 @@ struct ustr {
 struct extent_position {
 	struct buffer_head *bh;
 	uint32_t offset;
-	kernel_lb_addr block;
+	struct kernel_lb_addr block;
 };
 
 /* super.c */
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+	struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+	BUG_ON(!bh);
+	WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+		     bh->b_data)->integrityType !=
+		     cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+	sb->s_dirt = 1;
+	UDF_SB(sb)->s_lvid_dirty = 1;
+}
 
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
 		     unsigned long);
 
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, kernel_lb_addr);
+extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
 extern int udf_write_inode(struct inode *, int);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
-			   kernel_long_ad *, sector_t);
+			   struct kernel_long_ad *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
-			 kernel_lb_addr *, uint32_t *, sector_t *);
+			 struct kernel_lb_addr *, uint32_t *, sector_t *);
 extern int8_t udf_add_aext(struct inode *, struct extent_position *,
-			   kernel_lb_addr, uint32_t, int);
+			   struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_write_aext(struct inode *, struct extent_position *,
-			     kernel_lb_addr, uint32_t, int);
+			     struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_delete_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 extern int8_t udf_next_aext(struct inode *, struct extent_position *,
-			    kernel_lb_addr *, uint32_t *, int);
+			    struct kernel_lb_addr *, uint32_t *, int);
 extern int8_t udf_current_aext(struct inode *, struct extent_position *,
-			       kernel_lb_addr *, uint32_t *, int);
+			       struct kernel_lb_addr *, uint32_t *, int);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
 extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
 					   uint32_t, uint16_t *);
 extern struct buffer_head *udf_read_ptagged(struct super_block *,
-					    kernel_lb_addr, uint32_t,
+					    struct kernel_lb_addr *, uint32_t,
 					    uint16_t *);
 extern void udf_update_tag(char *, int);
 extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
 				      uint32_t);
 extern int udf_relocate_blocks(struct super_block *, long, long *);
 
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+		  uint32_t offset)
+{
+	return udf_get_pblock(sb, loc->logicalBlockNum,
+			      loc->partitionReferenceNum, offset);
+}
+
 /* unicode.c */
 extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
 
 /* balloc.c */
 extern void udf_free_blocks(struct super_block *, struct inode *,
-			    kernel_lb_addr, uint32_t, uint32_t);
+			    struct kernel_lb_addr *, uint32_t, uint32_t);
 extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 			       uint32_t, uint32_t);
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
 						struct udf_fileident_bh *,
 						struct fileIdentDesc *,
 						struct extent_position *,
-						kernel_lb_addr *, uint32_t *,
+						struct kernel_lb_addr *, uint32_t *,
 						sector_t *);
 extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
 					       int *offset);
-extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
-extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
 /* udftime.c */
 extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
-					       timestamp src);
-extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
+					       struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
 
 #endif				/* __UDF_DECL_H */
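udf_get_lb_pblock() moves from a #define to a static inline at the same time it switches to a pointer argument. Besides prototype checking, the inline evaluates its argument exactly once; the old macro expanded loc twice, so an argument with side effects would have misbehaved. A hedged illustration of the hazard (locp is a hypothetical caller variable):

	/* old macro: 'loc' is textually pasted twice, so */
	udf_get_lb_pblock(sb, *locp++, 0);
	/* expanded to udf_get_pblock(sb, (*locp++).logicalBlockNum,
	 *                   (*locp++).partitionReferenceNum, 0);
	 * i.e. two increments and mismatched fields */

	/* new inline: 'loc' evaluated once, types checked */
	uint32_t pblk = udf_get_lb_pblock(sb, &loc, 0);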
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
 #include <asm/byteorder.h>
 #include <linux/string.h>
 
-static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
 {
-	kernel_lb_addr out;
+	struct kernel_lb_addr out;
 
 	out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
 	out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
 	return out;
 }
 
-static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
 {
-	lb_addr out;
+	struct lb_addr out;
 
 	out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
 	out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
 	return out;
 }
 
-static inline short_ad lesa_to_cpu(short_ad in)
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
 	return out;
 }
 
-static inline short_ad cpu_to_lesa(short_ad in)
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
 {
-	short_ad out;
+	struct short_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
 	return out;
 }
 
-static inline kernel_long_ad lela_to_cpu(long_ad in)
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
 {
-	kernel_long_ad out;
+	struct kernel_long_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
 	return out;
 }
 
-static inline long_ad cpu_to_lela(kernel_long_ad in)
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
 {
-	long_ad out;
+	struct long_ad out;
 
 	out.extLength = cpu_to_le32(in.extLength);
 	out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
 	return out;
 }
 
-static inline kernel_extent_ad leea_to_cpu(extent_ad in)
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
 {
-	kernel_extent_ad out;
+	struct kernel_extent_ad out;
 
 	out.extLength = le32_to_cpu(in.extLength);
 	out.extLocation = le32_to_cpu(in.extLocation);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
-struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
 {
 	int yday;
 	u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
 	return dest;
 }
 
-timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
 {
 	long int days, rem, y;
 	const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 {
 	const uint8_t *ocu;
 	uint8_t cmp_id, ocu_len;
-	int i;
+	int i, len;
 
 
 	ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 		if (cmp_id == 16)
 			c = (c << 8) | ocu[i++];
 
-		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
-					      UDF_NAME_LEN - utf_o->u_len);
+		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+				    UDF_NAME_LEN - utf_o->u_len);
+		/* Valid character? */
+		if (len >= 0)
+			utf_o->u_len += len;
+		else
+			utf_o->u_name[utf_o->u_len++] = '?';
 	}
 	utf_o->u_cmpID = 8;
 
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
 			int length)
 {
-	unsigned len, i, max_val;
+	int len;
+	unsigned i, max_val;
 	uint16_t uni_char;
 	int u_len;
 
@@ -302,8 +308,13 @@ try_again:
 	u_len = 0U;
 	for (i = 0U; i < uni->u_len; i++) {
 		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
-		if (len <= 0)
+		if (!len)
 			continue;
+		/* Invalid character, deal with it */
+		if (len < 0) {
+			len = 1;
+			uni_char = '?';
+		}
 
 		if (uni_char > max_val) {
 			max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 		     int flen)
 {
-	struct ustr filename, unifilename;
-	int len;
+	struct ustr *filename, *unifilename;
+	int len = 0;
 
-	if (udf_build_ustr_exact(&unifilename, sname, flen))
+	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!filename)
 		return 0;
 
+	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+	if (!unifilename)
+		goto out1;
+
+	if (udf_build_ustr_exact(unifilename, sname, flen))
+		goto out2;
+
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		if (!udf_CS0toUTF8(&filename, &unifilename)) {
+		if (!udf_CS0toUTF8(filename, unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
-				  &unifilename)) {
+		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+				  unifilename)) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
-			return 0;
+			goto out2;
 		}
 	} else
-		return 0;
+		goto out2;
 
-	len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
-				     unifilename.u_name, unifilename.u_len);
-	if (len)
-		return len;
-
-	return 0;
+	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+				     unifilename->u_name, unifilename->u_len);
+out2:
+	kfree(unifilename);
+out1:
+	kfree(filename);
+	return len;
 }
 
 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
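udf_get_filename() previously kept two struct ustr buffers on the kernel stack; they are now kmalloc'd (with GFP_NOFS, since this runs on filesystem paths) and released through the usual goto ladder, where each label frees only what was allocated before the jump. The idiom reduced to its shape, with illustrative names (BUF_SZ, fill, consume are not from the patch):

	int use_two_buffers(void)
	{
		char *a, *b;
		int len = 0;	/* 0 doubles as the failure result here */

		a = kmalloc(BUF_SZ, GFP_NOFS);
		if (!a)
			return 0;
		b = kmalloc(BUF_SZ, GFP_NOFS);
		if (!b)
			goto out1;	/* only 'a' exists so far */

		if (fill(a, b) < 0)
			goto out2;	/* both buffers need freeing */
		len = consume(a, b);
	out2:
		kfree(b);	/* success falls through the same exits */
	out1:
		kfree(a);
		return len;
	}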
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index dbbbc4668769..6321b797061b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= ufs_sync_file,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 625ef17c6f83..2bd3a1615714 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -30,7 +30,7 @@
 #include "ufs.h"
 
 
-static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	int err;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee239..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ufs_super_block_first *usb1;
 	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_kernel();
 
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
 	buf->f_files = uspi->s_ncg * uspi->s_ipg;
 	buf->f_namelen = UFS_MAXNAMLEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	unlock_kernel();
 
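Both the UDF and UFS statfs hunks fill the previously-zero f_fsid the same way: huge_encode_dev() packs the backing device's dev_t into 64 bits, which are then split across the two 32-bit words of __kernel_fsid_t. The low/high split is trivially reversible, as this recap sketch shows:

	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

	buf->f_fsid.val[0] = (u32)id;		/* low 32 bits */
	buf->f_fsid.val[1] = (u32)(id >> 32);	/* high 32 bits */

	/* reassembling, e.g. in a test harness: */
	u64 again = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];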
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 69b3427d7885..d0c4acd4f1f3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -98,8 +98,8 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 /* file.c */
 extern const struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
-
 extern const struct address_space_operations ufs_aops;
+extern int ufs_sync_file(struct file *, struct dentry *, int);
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fcac032..d51b8f9db921 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			return -E2BIG;
-		kvalue = kmalloc(size, GFP_KERNEL);
-		if (!kvalue)
-			return -ENOMEM;
-		if (copy_from_user(kvalue, value, size)) {
-			kfree(kvalue);
-			return -EFAULT;
-		}
+		kvalue = memdup_user(value, size);
+		if (IS_ERR(kvalue))
+			return PTR_ERR(kvalue);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
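memdup_user() folds the kmalloc + copy_from_user + unwind triple into a single call; on failure the caller gets an ERR_PTR instead of NULL, hence the IS_ERR()/PTR_ERR() checks here and in the three XFS hunks below. Roughly what the helper does internally (a sketch; see mm/util.c for the real implementation):

	void *my_memdup_user(const void __user *src, size_t len)
	{
		void *p = kmalloc(len, GFP_KERNEL);

		if (!p)
			return ERR_PTR(-ENOMEM);
		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}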
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0c..f4e255441574 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
 STATIC int
 xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
-	struct page		*page)
+	struct vm_fault		*vmf)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..34eaab608e6e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
 	if (len > XATTR_SIZE_MAX)
 		return EINVAL;
 
-	kbuf = kmalloc(len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	if (copy_from_user(kbuf, ubuf, len))
-		goto out_kfree;
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 
- out_kfree:
-	kfree(kbuf);
 	return error;
 }
 
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(am_hreq.ops, size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, am_hreq.ops, size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3db790..0882d166239a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
+	ops = memdup_user(compat_ptr(am_hreq.ops), size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
 		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
-		goto out_kfree_ops;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 631d0137551e..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -230,7 +230,7 @@ xfs_vn_mknod(
 	}
 
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 
 	xfs_dentry_to_name(&name, dentry);
 	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -404,7 +404,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
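The two umask hunks swap the open-coded current->fs->umask for the current_umask() accessor introduced in this release; behavior is unchanged, the indirection just keeps umask representation details out of individual filesystems. Roughly what the helper amounts to at this point in the tree (a sketch; the real helper lives in core VFS code):

	/* approximately equivalent; my_current_umask is an illustrative name */
	static inline int my_current_umask(void)
	{
		return current->fs->umask;
	}

	/* caller side, as in the hunks above:
	 *	mode &= ~my_current_umask();
	 */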