-rw-r--r--  Documentation/filesystems/00-INDEX | 2
-rw-r--r--  Documentation/filesystems/ext4.txt | 236
-rw-r--r--  Documentation/lockdep-design.txt | 6
-rw-r--r--  Documentation/sysctl/kernel.txt | 5
-rw-r--r--  Makefile | 3
-rw-r--r--  arch/alpha/kernel/alpha_ksyms.c | 93
-rw-r--r--  arch/alpha/kernel/core_irongate.c | 2
-rw-r--r--  arch/alpha/kernel/irq_alpha.c | 3
-rw-r--r--  arch/alpha/kernel/pci-noop.c | 1
-rw-r--r--  arch/alpha/kernel/pci_iommu.c | 17
-rw-r--r--  arch/alpha/kernel/process.c | 5
-rw-r--r--  arch/alpha/kernel/setup.c | 6
-rw-r--r--  arch/alpha/kernel/smp.c | 8
-rw-r--r--  arch/alpha/kernel/time.c | 1
-rw-r--r--  arch/alpha/mm/numa.c | 2
-rw-r--r--  arch/arm/kernel/armksyms.c | 6
-rw-r--r--  arch/arm/mach-versatile/core.c | 4
-rw-r--r--  arch/arm/mach-versatile/pci.c | 32
-rw-r--r--  arch/arm/vfp/vfpmodule.c | 2
-rw-r--r--  arch/arm26/kernel/armksyms.c | 8
-rw-r--r--  arch/avr32/kernel/time.c | 10
-rw-r--r--  arch/avr32/mach-at32ap/extint.c | 5
-rw-r--r--  arch/avr32/mach-at32ap/intc.c | 7
-rw-r--r--  arch/i386/Kconfig.cpu | 3
-rw-r--r--  arch/i386/kernel/io_apic.c | 2
-rw-r--r--  arch/i386/kernel/microcode.c | 8
-rw-r--r--  arch/i386/kernel/setup.c | 13
-rw-r--r--  arch/i386/kernel/syscall_table.S | 1
-rw-r--r--  arch/i386/lib/usercopy.c | 2
-rw-r--r--  arch/i386/mm/discontig.c | 11
-rw-r--r--  arch/ia64/mm/contig.c | 1
-rw-r--r--  arch/ia64/mm/discontig.c | 1
-rw-r--r--  arch/m32r/kernel/setup.c | 8
-rw-r--r--  arch/m32r/kernel/setup_mappi.c | 16
-rw-r--r--  arch/m32r/kernel/signal.c | 6
-rw-r--r--  arch/m32r/kernel/smp.c | 2
-rw-r--r--  arch/m32r/kernel/sys_m32r.c | 6
-rw-r--r--  arch/m32r/kernel/traps.c | 2
-rw-r--r--  arch/m68k/kernel/m68k_ksyms.c | 51
-rw-r--r--  arch/m68k/kernel/process.c | 11
-rw-r--r--  arch/m68k/kernel/setup.c | 15
-rw-r--r--  arch/m68k/kernel/traps.c | 6
-rw-r--r--  arch/m68k/mm/kmap.c | 4
-rw-r--r--  arch/m68k/mm/memory.c | 8
-rw-r--r--  arch/m68k/mm/sun3kmap.c | 11
-rw-r--r--  arch/m68k/sun3/Makefile | 2
-rw-r--r--  arch/m68k/sun3/idprom.c | 3
-rw-r--r--  arch/m68k/sun3/sun3_ksyms.c | 13
-rw-r--r--  arch/m68k/sun3/sun3dvma.c | 6
-rw-r--r--  arch/parisc/kernel/parisc_ksyms.c | 4
-rw-r--r--  arch/powerpc/mm/mem.c | 7
-rw-r--r--  arch/powerpc/mm/numa.c | 6
-rw-r--r--  arch/ppc/mm/init.c | 7
-rw-r--r--  arch/s390/kernel/s390_ksyms.c | 1
-rw-r--r--  arch/um/Kconfig | 35
-rw-r--r--  arch/um/Kconfig.i386 | 49
-rw-r--r--  arch/um/Makefile-x86_64 | 4
-rw-r--r--  arch/um/include/common-offsets.h | 11
-rw-r--r--  arch/um/include/longjmp.h | 3
-rw-r--r--  arch/um/include/os.h | 1
-rw-r--r--  arch/um/include/sysdep-i386/kernel-offsets.h | 5
-rw-r--r--  arch/um/include/sysdep-x86_64/kernel-offsets.h | 5
-rw-r--r--  arch/um/kernel/skas/mmu.c | 5
-rw-r--r--  arch/um/kernel/tt/uaccess_user.c | 6
-rw-r--r--  arch/um/os-Linux/tt.c | 1
-rw-r--r--  arch/um/os-Linux/util.c | 9
-rw-r--r--  arch/um/sys-x86_64/ksyms.c | 3
-rw-r--r--  arch/x86_64/kernel/io_apic.c | 2
-rw-r--r--  arch/x86_64/mm/init.c | 9
-rw-r--r--  arch/x86_64/mm/numa.c | 8
-rw-r--r--  drivers/ata/libata-core.c | 6
-rw-r--r--  drivers/ata/libata-scsi.c | 46
-rw-r--r--  drivers/ata/pata_qdi.c | 2
-rw-r--r--  drivers/ata/sata_promise.c | 1
-rw-r--r--  drivers/block/DAC960.h | 4
-rw-r--r--  drivers/block/amiflop.c | 13
-rw-r--r--  drivers/block/xd.c | 2
-rw-r--r--  drivers/block/z2ram.c | 28
-rw-r--r--  drivers/char/ip2/i2lib.c | 11
-rw-r--r--  drivers/char/ip2/i2lib.h | 2
-rw-r--r--  drivers/char/ip2/ip2main.c | 4
-rw-r--r--  drivers/char/ipmi/ipmi_msghandler.c | 122
-rw-r--r--  drivers/char/tpm/tpm.c | 9
-rw-r--r--  drivers/char/tpm/tpm_atmel.c | 10
-rw-r--r--  drivers/char/tpm/tpm_nsc.c | 6
-rw-r--r--  drivers/eisa/eisa-bus.c | 22
-rw-r--r--  drivers/firmware/dell_rbu.c | 23
-rw-r--r--  drivers/firmware/efivars.c | 7
-rw-r--r--  drivers/ide/pci/generic.c | 10
-rw-r--r--  drivers/input/misc/wistron_btns.c | 2
-rw-r--r--  drivers/isdn/pcbit/layer2.c | 1
-rw-r--r--  drivers/isdn/sc/init.c | 23
-rw-r--r--  drivers/isdn/sc/packet.c | 14
-rw-r--r--  drivers/isdn/sc/shmem.c | 2
-rw-r--r--  drivers/mca/mca-bus.c | 28
-rw-r--r--  drivers/md/bitmap.c | 2
-rw-r--r--  drivers/net/b44.c | 9
-rw-r--r--  drivers/net/bonding/bond_alb.c | 4
-rw-r--r--  drivers/net/ehea/ehea.h | 13
-rw-r--r--  drivers/net/ehea/ehea_main.c | 6
-rw-r--r--  drivers/net/ehea/ehea_phyp.c | 573
-rw-r--r--  drivers/net/eth16i.c | 2
-rw-r--r--  drivers/net/forcedeth.c | 31
-rw-r--r--  drivers/net/ibmveth.c | 58
-rw-r--r--  drivers/net/mv643xx_eth.c | 4
-rw-r--r--  drivers/net/skge.c | 220
-rw-r--r--  drivers/net/skge.h | 25
-rw-r--r--  drivers/net/sky2.c | 36
-rw-r--r--  drivers/net/sky2.h | 45
-rw-r--r--  drivers/net/smc91x.h | 18
-rw-r--r--  drivers/net/spider_net.c | 246
-rw-r--r--  drivers/net/spider_net.h | 35
-rw-r--r--  drivers/net/spider_net_ethtool.c | 6
-rw-r--r--  drivers/net/sun3_82586.c | 2
-rw-r--r--  drivers/net/sun3lance.c | 6
-rw-r--r--  drivers/net/tulip/de2104x.c | 8
-rw-r--r--  drivers/pci/Kconfig | 2
-rw-r--r--  drivers/scsi/aha152x.c | 2
-rw-r--r--  drivers/scsi/dtc.c | 2
-rw-r--r--  drivers/scsi/fdomain.c | 2
-rw-r--r--  drivers/scsi/seagate.c | 2
-rw-r--r--  drivers/scsi/t128.c | 2
-rw-r--r--  drivers/scsi/wd7000.c | 2
-rw-r--r--  drivers/video/Kconfig | 1
-rw-r--r--  drivers/video/nvidia/nv_i2c.c | 45
-rw-r--r--  fs/Kconfig | 107
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/afs/dir.c | 8
-rw-r--r--  fs/autofs4/autofs_i.h | 3
-rw-r--r--  fs/autofs4/init.c | 2
-rw-r--r--  fs/autofs4/inode.c | 22
-rw-r--r--  fs/autofs4/waitq.c | 1
-rw-r--r--  fs/bio.c | 9
-rw-r--r--  fs/buffer.c | 23
-rw-r--r--  fs/compat_ioctl.c | 10
-rw-r--r--  fs/dcache.c | 130
-rw-r--r--  fs/eventpoll.c | 56
-rw-r--r--  fs/ext2/super.c | 16
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 12
-rw-r--r--  fs/ext4/acl.c | 551
-rw-r--r--  fs/ext4/acl.h | 81
-rw-r--r--  fs/ext4/balloc.c | 1833
-rw-r--r--  fs/ext4/bitmap.c | 32
-rw-r--r--  fs/ext4/dir.c | 518
-rw-r--r--  fs/ext4/extents.c | 2152
-rw-r--r--  fs/ext4/file.c | 139
-rw-r--r--  fs/ext4/fsync.c | 88
-rw-r--r--  fs/ext4/hash.c | 152
-rw-r--r--  fs/ext4/ialloc.c | 772
-rw-r--r--  fs/ext4/inode.c | 3233
-rw-r--r--  fs/ext4/ioctl.c | 306
-rw-r--r--  fs/ext4/namei.c | 2395
-rw-r--r--  fs/ext4/namei.h | 8
-rw-r--r--  fs/ext4/resize.c | 1045
-rw-r--r--  fs/ext4/super.c | 2829
-rw-r--r--  fs/ext4/symlink.c | 54
-rw-r--r--  fs/ext4/xattr.c | 1317
-rw-r--r--  fs/ext4/xattr.h | 145
-rw-r--r--  fs/ext4/xattr_security.c | 77
-rw-r--r--  fs/ext4/xattr_trusted.c | 62
-rw-r--r--  fs/ext4/xattr_user.c | 64
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd2/Makefile | 7
-rw-r--r--  fs/jbd2/checkpoint.c | 697
-rw-r--r--  fs/jbd2/commit.c | 920
-rw-r--r--  fs/jbd2/journal.c | 2083
-rw-r--r--  fs/jbd2/recovery.c | 609
-rw-r--r--  fs/jbd2/revoke.c | 712
-rw-r--r--  fs/jbd2/transaction.c | 2080
-rw-r--r--  fs/jffs2/super.c | 8
-rw-r--r--  fs/minix/inode.c | 8
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/reiserfs/super.c | 31
-rw-r--r--  fs/super.c | 12
-rw-r--r--  fs/sysv/super.c | 15
-rw-r--r--  include/asm-alpha/io.h | 13
-rw-r--r--  include/asm-arm/arch-versatile/hardware.h | 4
-rw-r--r--  include/asm-arm/io.h | 17
-rw-r--r--  include/asm-arm/uaccess.h | 2
-rw-r--r--  include/asm-avr32/irq_regs.h | 1
-rw-r--r--  include/asm-frv/io.h | 21
-rw-r--r--  include/asm-generic/bitops/sched.h | 2
-rw-r--r--  include/asm-i386/io.h | 27
-rw-r--r--  include/asm-i386/uaccess.h | 67
-rw-r--r--  include/asm-i386/unistd.h | 3
-rw-r--r--  include/asm-m32r/io.h | 32
-rw-r--r--  include/asm-m68k/uaccess.h | 16
-rw-r--r--  include/asm-mips/io.h | 26
-rw-r--r--  include/asm-powerpc/io.h | 26
-rw-r--r--  include/asm-ppc/io.h | 16
-rw-r--r--  include/asm-sh/io.h | 16
-rw-r--r--  include/asm-sh64/io.h | 16
-rw-r--r--  include/asm-sparc64/io.h | 15
-rw-r--r--  include/asm-x86_64/io.h | 27
-rw-r--r--  include/linux/bitmap.h | 13
-rw-r--r--  include/linux/carta_random32.h | 29
-rw-r--r--  include/linux/compat_ioctl.h | 24
-rw-r--r--  include/linux/cpumask.h | 14
-rw-r--r--  include/linux/dcache.h | 1
-rw-r--r--  include/linux/ext4_fs.h | 994
-rw-r--r--  include/linux/ext4_fs_extents.h | 198
-rw-r--r--  include/linux/ext4_fs_i.h | 158
-rw-r--r--  include/linux/ext4_fs_sb.h | 94
-rw-r--r--  include/linux/ext4_jbd2.h | 273
-rw-r--r--  include/linux/hugetlb.h | 1
-rw-r--r--  include/linux/io.h | 27
-rw-r--r--  include/linux/jbd2.h | 1107
-rw-r--r--  include/linux/magic.h | 1
-rw-r--r--  include/linux/mm.h | 7
-rw-r--r--  include/linux/module.h | 3
-rw-r--r--  include/linux/nbd.h | 2
-rw-r--r--  include/linux/nodemask.h | 14
-rw-r--r--  include/linux/syscalls.h | 4
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/lockdep.c | 5
-rw-r--r--  kernel/module.c | 94
-rw-r--r--  kernel/power/disk.c | 8
-rw-r--r--  kernel/power/user.c | 8
-rw-r--r--  kernel/printk.c | 11
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/sched.c | 6
-rw-r--r--  kernel/workqueue.c | 7
-rw-r--r--  lib/Kconfig.debug | 15
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/bitmap.c | 54
-rw-r--r--  lib/carta_random32.c | 41
-rw-r--r--  mm/hugetlb.c | 22
-rw-r--r--  mm/mempolicy.c | 2
-rw-r--r--  mm/page_alloc.c | 31
-rw-r--r--  mm/rmap.c | 5
-rw-r--r--  mm/shmem_acl.c | 2
-rw-r--r--  mm/truncate.c | 3
-rw-r--r--  scripts/kconfig/lxdialog/dialog.h | 1
-rwxr-xr-x  scripts/kernel-doc | 15
238 files changed, 30142 insertions(+), 1618 deletions(-)
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 3c384c0cf86e..4dc28cc93503 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -34,6 +34,8 @@ ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 ext3.txt
 	- info, mount options and specifications for the Ext3 filesystem.
+ext4.txt
+	- info, mount options and specifications for the Ext4 filesystem.
 files.txt
 	- info on file management in the Linux kernel.
 fuse.txt
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
new file mode 100644
index 000000000000..6a4adcae9f9a
--- /dev/null
+++ b/Documentation/filesystems/ext4.txt
@@ -0,0 +1,236 @@
+
+Ext4 Filesystem
+===============
+
+This is a development version of the ext4 filesystem, an advanced level
+of the ext3 filesystem which incorporates scalability and reliability
+enhancements for supporting large filesystems (64 bit) in keeping with
+increasing disk capacities and state-of-the-art feature requirements.
+
+Mailing list: linux-ext4@vger.kernel.org
+
+
+1. Quick usage instructions:
+===========================
+
+  - Grab updated e2fsprogs from
+    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
+    This is a patchset on top of e2fsprogs-1.39, which can be found at
+    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
+
+  - It's still mke2fs -j /dev/hda1
+
+  - mount /dev/hda1 /wherever -t ext4dev
+
+  - To enable extents,
+
+      mount /dev/hda1 /wherever -t ext4dev -o extents
+
+  - The filesystem is compatible with the ext3 driver until you add a file
+    which has extents (i.e.: `mount -o extents', then create a file).
+
+    NOTE: The "extents" mount flag is temporary. It will soon go away and
+    extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+
+  - When comparing performance with other filesystems, remember that
+    ext3/4 by default offers higher data integrity guarantees than most. So
+    when comparing with a metadata-only journalling filesystem, use `mount -o
+    data=writeback'. And you might as well use `mount -o nobh' too along
+    with it. Making the journal larger than the mke2fs default often helps
+    performance with metadata-intensive workloads.
+
+2. Features
+===========
+
+2.1 Currently available
+
+* ability to use filesystems > 16TB
+* extent format reduces metadata overhead (RAM, IO for access, transactions)
+* extent format more robust in face of on-disk corruption due to magics,
+* internal redundancy in tree
+
+2.2 Previously available, soon to be enabled by default by "mkfs.ext4":
+
+* dir_index and resize inode will be on by default
+* large inodes will be used by default for fast EAs, nsec timestamps, etc
+
+2.3 Candidate features for future inclusion
+
+There are several under discussion; whether they all make it in is
+partly a function of how much time everyone has to work on them:
+
+* improved file allocation (multi-block alloc, delayed alloc; basically done)
+* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
+* nsec timestamps for mtime, atime, ctime, create time (patch exists,
+  needs some e2fsck work)
+* inode version field on disk (NFSv4, Lustre; prototype exists)
+* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
+* journal checksumming for robustness, performance (prototype exists)
+* persistent file preallocation (e.g. for streaming media, databases)
+
+Features like metadata checksumming have been discussed and planned for
+a bit but no patches exist yet, so I'm not sure they're in the near-term
+roadmap.
+
+The big performance win will come with mballoc and delalloc. CFS has
+been using mballoc for a few years already with Lustre, and IBM + Bull
+did a lot of benchmarking on it. The reason it isn't in the first set of
+patches is partly a manageability issue, and partly because it doesn't
+directly affect the on-disk format (outside of much better allocation)
+so it isn't critical to get into the first round of changes. I believe
+Alex is working on a new set of patches right now.
+
+3. Options
+==========
+
+When mounting an ext4 filesystem, the following options are accepted:
+(*) == default
+
+extents               ext4 will use extents to address file data. The
+                      file system will no longer be mountable by ext3.
+
+journal=update        Update the ext4 file system's journal to the current
+                      format.
+
+journal=inum          When a journal already exists, this option is ignored.
+                      Otherwise, it specifies the number of the inode which
+                      will represent the ext4 file system's journal file.
+
+journal_dev=devnum    When the external journal device's major/minor numbers
+                      have changed, this option allows the user to specify
+                      the new journal location. The journal device is
+                      identified through its new major/minor numbers encoded
+                      in devnum.
+
+noload                Don't load the journal on mounting.
+
+data=journal          All data are committed into the journal prior to being
+                      written into the main file system.
+
+data=ordered (*)      All data are forced directly out to the main file
+                      system prior to its metadata being committed to the
+                      journal.
+
+data=writeback        Data ordering is not preserved, data may be written
+                      into the main file system after its metadata has been
+                      committed to the journal.
+
+commit=nrsec (*)      Ext4 can be told to sync all its data and metadata
+                      every 'nrsec' seconds. The default value is 5 seconds.
+                      This means that if you lose your power, you will lose
+                      as much as the latest 5 seconds of work (your
+                      filesystem will not be damaged though, thanks to the
+                      journaling). This default value (or any low value)
+                      will hurt performance, but it's good for data-safety.
+                      Setting it to 0 will have the same effect as leaving
+                      it at the default (5 seconds).
+                      Setting it to very large values will improve
+                      performance.
+
+barrier=1             This enables/disables barriers. barrier=0 disables
+                      it, barrier=1 enables it.
+
+orlov (*)             This enables the new Orlov block allocator. It is
+                      enabled by default.
+
+oldalloc              This disables the Orlov block allocator and enables
+                      the old block allocator. Orlov should have better
+                      performance - we'd like to get some feedback if it's
+                      the contrary for you.
+
+user_xattr            Enables Extended User Attributes. Additionally, you
+                      need to have extended attribute support enabled in the
+                      kernel configuration (CONFIG_EXT4_FS_XATTR). See the
+                      attr(5) manual page and http://acl.bestbits.at/ to
+                      learn more about extended attributes.
+
+nouser_xattr          Disables Extended User Attributes.
+
+acl                   Enables POSIX Access Control Lists support.
+                      Additionally, you need to have ACL support enabled in
+                      the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL).
+                      See the acl(5) manual page and http://acl.bestbits.at/
+                      for more information.
+
+noacl                 This option disables POSIX Access Control List
+                      support.
+
+reservation
+
+noreservation
+
+bsddf (*)             Make 'df' act like BSD.
+minixdf               Make 'df' act like Minix.
+
+check=none            Don't do extra checking of bitmaps on mount.
+nocheck
+
+debug                 Extra debugging information is sent to syslog.
+
+errors=remount-ro(*)  Remount the filesystem read-only on an error.
+errors=continue       Keep going on a filesystem error.
+errors=panic          Panic and halt the machine if an error occurs.
+
+grpid                 Give objects the same group ID as their creator.
+bsdgroups
+
+nogrpid (*)           New objects have the group ID of their creator.
+sysvgroups
+
+resgid=n              The group ID which may use the reserved blocks.
+
+resuid=n              The user ID which may use the reserved blocks.
+
+sb=n                  Use alternate superblock at this location.
+
+quota
+noquota
+grpquota
+usrquota
+
+bh (*)                ext4 associates buffer heads to data pages to
+nobh                  (a) cache disk block mapping information
+                      (b) link pages into transaction to provide
+                          ordering guarantees.
+                      "bh" option forces use of buffer heads.
+                      "nobh" option tries to avoid associating buffer
+                      heads (supported only for "writeback" mode).
+
+
+Data Mode
+---------
+There are 3 different data modes:
+
+* writeback mode
+In data=writeback mode, ext4 does not journal data at all. This mode provides
+a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+mode - metadata journaling. A crash+recovery can cause incorrect data to
+appear in files which were written shortly before the crash. This mode will
+typically provide the best ext4 performance.
+
+* ordered mode
+In data=ordered mode, ext4 only officially journals metadata, but it logically
+groups metadata and data blocks into a single unit called a transaction. When
+it's time to write the new metadata out to disk, the associated data blocks
+are written first. In general, this mode performs slightly slower than
+writeback but significantly faster than journal mode.
+
+* journal mode
+data=journal mode provides full data and metadata journaling. All new data is
+written to the journal first, and then to its final location.
+In the event of a crash, the journal can be replayed, bringing both data and
+metadata into a consistent state. This mode is the slowest except when data
+needs to be read from and written to disk at the same time, where it
+outperforms all other modes.
+
+References
+==========
+
+kernel source: <file:fs/ext4/>
+               <file:fs/jbd2/>
+
+programs:      http://e2fsprogs.sourceforge.net/
+               http://ext2resize.sourceforge.net
+
+useful links:  http://fedoraproject.org/wiki/ext3-devel
+               http://www.bullopensource.org/ext4/
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index dab123db5a4f..488773018152 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -50,10 +50,10 @@ The bit position indicates hardirq, softirq, hardirq-read,
 softirq-read respectively, and the character displayed in each
 indicates:
 
-    '.'  acquired while irqs enabled
+    '.'  acquired while irqs disabled
     '+'  acquired in irq context
-    '-'  acquired in process context with irqs disabled
-    '?'  read-acquired both with irqs enabled and in irq context
+    '-'  acquired with irqs enabled
+    '?'  read acquired in irq context with irqs enabled.
 
 Unused mutexes cannot be part of the cause of an error.
 
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 89bf8c20a586..0bc7f1e3c9e6 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -86,7 +86,7 @@ valid for 30 seconds.
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
-. max length 64 characters; default value is "core"
+. max length 128 characters; default value is "core"
 . core_pattern is used as a pattern template for the output filename;
   certain string patterns (beginning with '%') are substituted with
   their actual values.
@@ -105,6 +105,9 @@ core_pattern is used to specify a core dumpfile pattern name.
 	%h	hostname
 	%e	executable filename
 	%<OTHER> both are dropped
+. If the first character of the pattern is a '|', the kernel will treat
+  the rest of the pattern as a command to run. The core dump will be
+  written to the standard input of that program instead of to a file.
 
 ==============================================================
 
diff --git a/Makefile b/Makefile
index 274b780029b1..f242829c4f0b 100644
--- a/Makefile
+++ b/Makefile
@@ -741,6 +741,9 @@ endif # ifdef CONFIG_KALLSYMS
 
 # vmlinux image - including updated kernel symbols
 vmlinux: $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) $(kallsyms.o) FORCE
+ifdef CONFIG_HEADERS_CHECK
+	$(Q)$(MAKE) headers_check
+endif
 	$(call if_changed_rule,vmlinux__)
 	$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $@
 	$(Q)rm -f .old_version
diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index 8b02420f732e..692809e4aece 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -6,41 +6,14 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/user.h>
11#include <linux/elfcore.h>
12#include <linux/socket.h>
13#include <linux/syscalls.h>
14#include <linux/in.h>
15#include <linux/in6.h>
16#include <linux/pci.h>
17#include <linux/screen_info.h>
18#include <linux/tty.h>
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/dma-mapping.h>
22
23#include <asm/io.h>
24#include <asm/console.h> 9#include <asm/console.h>
25#include <asm/hwrpb.h>
26#include <asm/uaccess.h> 10#include <asm/uaccess.h>
27#include <asm/processor.h>
28#include <asm/checksum.h> 11#include <asm/checksum.h>
29#include <linux/interrupt.h>
30#include <asm/fpu.h> 12#include <asm/fpu.h>
31#include <asm/irq.h>
32#include <asm/machvec.h> 13#include <asm/machvec.h>
33#include <asm/pgalloc.h>
34#include <asm/semaphore.h>
35#include <asm/tlbflush.h>
36#include <asm/cacheflush.h>
37#include <asm/vga.h>
38 14
39#include <asm/unistd.h> 15#include <asm/unistd.h>
40 16
41extern struct hwrpb_struct *hwrpb;
42extern spinlock_t rtc_lock;
43
44/* these are C runtime functions with special calling conventions: */ 17/* these are C runtime functions with special calling conventions: */
45extern void __divl (void); 18extern void __divl (void);
46extern void __reml (void); 19extern void __reml (void);
@@ -52,14 +25,9 @@ extern void __divqu (void);
52extern void __remqu (void); 25extern void __remqu (void);
53 26
54EXPORT_SYMBOL(alpha_mv); 27EXPORT_SYMBOL(alpha_mv);
55EXPORT_SYMBOL(screen_info);
56EXPORT_SYMBOL(perf_irq);
57EXPORT_SYMBOL(callback_getenv); 28EXPORT_SYMBOL(callback_getenv);
58EXPORT_SYMBOL(callback_setenv); 29EXPORT_SYMBOL(callback_setenv);
59EXPORT_SYMBOL(callback_save_env); 30EXPORT_SYMBOL(callback_save_env);
60#ifdef CONFIG_ALPHA_GENERIC
61EXPORT_SYMBOL(alpha_using_srm);
62#endif /* CONFIG_ALPHA_GENERIC */
63 31
64/* platform dependent support */ 32/* platform dependent support */
65EXPORT_SYMBOL(strcat); 33EXPORT_SYMBOL(strcat);
@@ -77,47 +45,14 @@ EXPORT_SYMBOL(__constant_c_memset);
77EXPORT_SYMBOL(copy_page); 45EXPORT_SYMBOL(copy_page);
78EXPORT_SYMBOL(clear_page); 46EXPORT_SYMBOL(clear_page);
79 47
80EXPORT_SYMBOL(__direct_map_base);
81EXPORT_SYMBOL(__direct_map_size);
82
83#ifdef CONFIG_PCI
84EXPORT_SYMBOL(pci_alloc_consistent);
85EXPORT_SYMBOL(pci_free_consistent);
86EXPORT_SYMBOL(pci_map_single);
87EXPORT_SYMBOL(pci_map_page);
88EXPORT_SYMBOL(pci_unmap_single);
89EXPORT_SYMBOL(pci_unmap_page);
90EXPORT_SYMBOL(pci_map_sg);
91EXPORT_SYMBOL(pci_unmap_sg);
92EXPORT_SYMBOL(pci_dma_supported);
93EXPORT_SYMBOL(pci_dac_dma_supported);
94EXPORT_SYMBOL(pci_dac_page_to_dma);
95EXPORT_SYMBOL(pci_dac_dma_to_page);
96EXPORT_SYMBOL(pci_dac_dma_to_offset);
97EXPORT_SYMBOL(alpha_gendev_to_pci);
98#endif
99EXPORT_SYMBOL(dma_set_mask);
100
101EXPORT_SYMBOL(dump_thread);
102EXPORT_SYMBOL(dump_elf_thread);
103EXPORT_SYMBOL(dump_elf_task);
104EXPORT_SYMBOL(dump_elf_task_fp);
105EXPORT_SYMBOL(hwrpb);
106EXPORT_SYMBOL(start_thread);
107EXPORT_SYMBOL(alpha_read_fp_reg); 48EXPORT_SYMBOL(alpha_read_fp_reg);
108EXPORT_SYMBOL(alpha_read_fp_reg_s); 49EXPORT_SYMBOL(alpha_read_fp_reg_s);
109EXPORT_SYMBOL(alpha_write_fp_reg); 50EXPORT_SYMBOL(alpha_write_fp_reg);
110EXPORT_SYMBOL(alpha_write_fp_reg_s); 51EXPORT_SYMBOL(alpha_write_fp_reg_s);
111 52
112/* In-kernel system calls. */ 53/* entry.S */
113EXPORT_SYMBOL(kernel_thread); 54EXPORT_SYMBOL(kernel_thread);
114EXPORT_SYMBOL(sys_dup);
115EXPORT_SYMBOL(sys_exit);
116EXPORT_SYMBOL(sys_write);
117EXPORT_SYMBOL(sys_lseek);
118EXPORT_SYMBOL(kernel_execve); 55EXPORT_SYMBOL(kernel_execve);
119EXPORT_SYMBOL(sys_setsid);
120EXPORT_SYMBOL(sys_wait4);
121 56
122/* Networking helper routines. */ 57/* Networking helper routines. */
123EXPORT_SYMBOL(csum_tcpudp_magic); 58EXPORT_SYMBOL(csum_tcpudp_magic);
@@ -134,10 +69,6 @@ EXPORT_SYMBOL(alpha_fp_emul_imprecise);
134EXPORT_SYMBOL(alpha_fp_emul); 69EXPORT_SYMBOL(alpha_fp_emul);
135#endif 70#endif
136 71
137#ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK
138EXPORT_SYMBOL(__min_ipl);
139#endif
140
141/* 72/*
142 * The following are specially called from the uaccess assembly stubs. 73 * The following are specially called from the uaccess assembly stubs.
143 */ 74 */
@@ -160,27 +91,10 @@ EXPORT_SYMBOL(up);
160 */ 91 */
161 92
162#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
163EXPORT_SYMBOL(flush_tlb_mm);
164EXPORT_SYMBOL(flush_tlb_range);
165EXPORT_SYMBOL(flush_tlb_page);
166EXPORT_SYMBOL(smp_imb);
167EXPORT_SYMBOL(cpu_data);
168EXPORT_SYMBOL(smp_num_cpus);
169EXPORT_SYMBOL(smp_call_function);
170EXPORT_SYMBOL(smp_call_function_on_cpu);
171EXPORT_SYMBOL(_atomic_dec_and_lock); 94EXPORT_SYMBOL(_atomic_dec_and_lock);
172#endif /* CONFIG_SMP */ 95#endif /* CONFIG_SMP */
173 96
174/* 97/*
175 * NUMA specific symbols
176 */
177#ifdef CONFIG_DISCONTIGMEM
178EXPORT_SYMBOL(node_data);
179#endif /* CONFIG_DISCONTIGMEM */
180
181EXPORT_SYMBOL(rtc_lock);
182
183/*
184 * The following are special because they're not called 98 * The following are special because they're not called
185 * explicitly (the C compiler or assembler generates them in 99 * explicitly (the C compiler or assembler generates them in
186 * response to division operations). Fortunately, their 100 * response to division operations). Fortunately, their
@@ -200,8 +114,3 @@ EXPORT_SYMBOL(__remqu);
200EXPORT_SYMBOL(memcpy); 114EXPORT_SYMBOL(memcpy);
201EXPORT_SYMBOL(memset); 115EXPORT_SYMBOL(memset);
202EXPORT_SYMBOL(memchr); 116EXPORT_SYMBOL(memchr);
203
204#ifdef CONFIG_ALPHA_IRONGATE
205EXPORT_SYMBOL(irongate_ioremap);
206EXPORT_SYMBOL(irongate_iounmap);
207#endif
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index 138d497d1cca..e4a0bcf1d28b 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -404,6 +404,7 @@ irongate_ioremap(unsigned long addr, unsigned long size)
404#endif 404#endif
405 return (void __iomem *)vaddr; 405 return (void __iomem *)vaddr;
406} 406}
407EXPORT_SYMBOL(irongate_ioremap);
407 408
408void 409void
409irongate_iounmap(volatile void __iomem *xaddr) 410irongate_iounmap(volatile void __iomem *xaddr)
@@ -414,3 +415,4 @@ irongate_iounmap(volatile void __iomem *xaddr)
414 if (addr) 415 if (addr)
415 return vfree((void *)(PAGE_MASK & addr)); 416 return vfree((void *)(PAGE_MASK & addr));
416} 417}
418EXPORT_SYMBOL(irongate_iounmap);
diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c
index 6dd126b8be85..e16aeb6e79ef 100644
--- a/arch/alpha/kernel/irq_alpha.c
+++ b/arch/alpha/kernel/irq_alpha.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/kernel_stat.h> 8#include <linux/kernel_stat.h>
9#include <linux/module.h>
9 10
10#include <asm/machvec.h> 11#include <asm/machvec.h>
11#include <asm/dma.h> 12#include <asm/dma.h>
@@ -16,6 +17,7 @@
16/* Hack minimum IPL during interrupt processing for broken hardware. */ 17/* Hack minimum IPL during interrupt processing for broken hardware. */
17#ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK 18#ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK
18int __min_ipl; 19int __min_ipl;
20EXPORT_SYMBOL(__min_ipl);
19#endif 21#endif
20 22
21/* 23/*
@@ -30,6 +32,7 @@ dummy_perf(unsigned long vector, struct pt_regs *regs)
30} 32}
31 33
32void (*perf_irq)(unsigned long, struct pt_regs *) = dummy_perf; 34void (*perf_irq)(unsigned long, struct pt_regs *) = dummy_perf;
35EXPORT_SYMBOL(perf_irq);
33 36
34/* 37/*
35 * The main interrupt entry point. 38 * The main interrupt entry point.
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index fff5cf93e816..174b729c504b 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -201,6 +201,7 @@ dma_set_mask(struct device *dev, u64 mask)
201 201
202 return 0; 202 return 0;
203} 203}
204EXPORT_SYMBOL(dma_set_mask);
204 205
205void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) 206void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
206{ 207{
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index c468e312e5f8..6e7d1fe6e935 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -300,6 +300,7 @@ pci_map_single(struct pci_dev *pdev, void *cpu_addr, size_t size, int dir)
300 dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0; 300 dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
301 return pci_map_single_1(pdev, cpu_addr, size, dac_allowed); 301 return pci_map_single_1(pdev, cpu_addr, size, dac_allowed);
302} 302}
303EXPORT_SYMBOL(pci_map_single);
303 304
304dma_addr_t 305dma_addr_t
305pci_map_page(struct pci_dev *pdev, struct page *page, unsigned long offset, 306pci_map_page(struct pci_dev *pdev, struct page *page, unsigned long offset,
@@ -314,6 +315,7 @@ pci_map_page(struct pci_dev *pdev, struct page *page, unsigned long offset,
314 return pci_map_single_1(pdev, (char *)page_address(page) + offset, 315 return pci_map_single_1(pdev, (char *)page_address(page) + offset,
315 size, dac_allowed); 316 size, dac_allowed);
316} 317}
318EXPORT_SYMBOL(pci_map_page);
317 319
318/* Unmap a single streaming mode DMA translation. The DMA_ADDR and 320/* Unmap a single streaming mode DMA translation. The DMA_ADDR and
319 SIZE must match what was provided for in a previous pci_map_single 321 SIZE must match what was provided for in a previous pci_map_single
@@ -379,6 +381,7 @@ pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, size_t size,
379 DBGA2("pci_unmap_single: sg [%lx,%lx] np %ld from %p\n", 381 DBGA2("pci_unmap_single: sg [%lx,%lx] np %ld from %p\n",
380 dma_addr, size, npages, __builtin_return_address(0)); 382 dma_addr, size, npages, __builtin_return_address(0));
381} 383}
384EXPORT_SYMBOL(pci_unmap_single);
382 385
383void 386void
384pci_unmap_page(struct pci_dev *pdev, dma_addr_t dma_addr, 387pci_unmap_page(struct pci_dev *pdev, dma_addr_t dma_addr,
@@ -386,6 +389,7 @@ pci_unmap_page(struct pci_dev *pdev, dma_addr_t dma_addr,
386{ 389{
387 pci_unmap_single(pdev, dma_addr, size, direction); 390 pci_unmap_single(pdev, dma_addr, size, direction);
388} 391}
392EXPORT_SYMBOL(pci_unmap_page);
389 393
390/* Allocate and map kernel buffer using consistent mode DMA for PCI 394/* Allocate and map kernel buffer using consistent mode DMA for PCI
391 device. Returns non-NULL cpu-view pointer to the buffer if 395 device. Returns non-NULL cpu-view pointer to the buffer if
@@ -427,6 +431,7 @@ try_again:
427 431
428 return cpu_addr; 432 return cpu_addr;
429} 433}
434EXPORT_SYMBOL(pci_alloc_consistent);
430 435
431/* Free and unmap a consistent DMA buffer. CPU_ADDR and DMA_ADDR must 436/* Free and unmap a consistent DMA buffer. CPU_ADDR and DMA_ADDR must
432 be values that were returned from pci_alloc_consistent. SIZE must 437 be values that were returned from pci_alloc_consistent. SIZE must
@@ -444,7 +449,7 @@ pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu_addr,
444 DBGA2("pci_free_consistent: [%x,%lx] from %p\n", 449 DBGA2("pci_free_consistent: [%x,%lx] from %p\n",
445 dma_addr, size, __builtin_return_address(0)); 450 dma_addr, size, __builtin_return_address(0));
446} 451}
447 452EXPORT_SYMBOL(pci_free_consistent);
448 453
449/* Classify the elements of the scatterlist. Write dma_address 454/* Classify the elements of the scatterlist. Write dma_address
450 of each element with: 455 of each element with:
@@ -672,6 +677,7 @@ pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
672 pci_unmap_sg(pdev, start, out - start, direction); 677 pci_unmap_sg(pdev, start, out - start, direction);
673 return 0; 678 return 0;
674} 679}
680EXPORT_SYMBOL(pci_map_sg);
675 681
676/* Unmap a set of streaming mode DMA translations. Again, cpu read 682/* Unmap a set of streaming mode DMA translations. Again, cpu read
677 rules concerning calls here are the same as for pci_unmap_single() 683 rules concerning calls here are the same as for pci_unmap_single()
@@ -752,6 +758,7 @@ pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, int nents,
752 758
753 DBGA("pci_unmap_sg: %ld entries\n", nents - (end - sg)); 759 DBGA("pci_unmap_sg: %ld entries\n", nents - (end - sg));
754} 760}
761EXPORT_SYMBOL(pci_unmap_sg);
755 762
756 763
757/* Return whether the given PCI device DMA address mask can be 764/* Return whether the given PCI device DMA address mask can be
@@ -786,6 +793,7 @@ pci_dma_supported(struct pci_dev *pdev, u64 mask)
786 793
787 return 0; 794 return 0;
788} 795}
796EXPORT_SYMBOL(pci_dma_supported);
789 797
790 798
791/* 799/*
@@ -908,6 +916,7 @@ pci_dac_dma_supported(struct pci_dev *dev, u64 mask)
908 916
909 return ok; 917 return ok;
910} 918}
919EXPORT_SYMBOL(pci_dac_dma_supported);
911 920
912dma64_addr_t 921dma64_addr_t
913pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, 922pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page,
@@ -917,6 +926,7 @@ pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page,
917 + __pa(page_address(page)) 926 + __pa(page_address(page))
918 + (dma64_addr_t) offset); 927 + (dma64_addr_t) offset);
919} 928}
929EXPORT_SYMBOL(pci_dac_page_to_dma);
920 930
921struct page * 931struct page *
922pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) 932pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
@@ -924,13 +934,14 @@ pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
924 unsigned long paddr = (dma_addr & PAGE_MASK) - alpha_mv.pci_dac_offset; 934 unsigned long paddr = (dma_addr & PAGE_MASK) - alpha_mv.pci_dac_offset;
925 return virt_to_page(__va(paddr)); 935 return virt_to_page(__va(paddr));
926} 936}
937EXPORT_SYMBOL(pci_dac_dma_to_page);
927 938
928unsigned long 939unsigned long
929pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) 940pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
930{ 941{
931 return (dma_addr & ~PAGE_MASK); 942 return (dma_addr & ~PAGE_MASK);
932} 943}
933 944EXPORT_SYMBOL(pci_dac_dma_to_offset);
934 945
935/* Helper for generic DMA-mapping functions. */ 946/* Helper for generic DMA-mapping functions. */
936 947
@@ -957,6 +968,7 @@ alpha_gendev_to_pci(struct device *dev)
957 /* This assumes ISA bus master with dma_mask 0xffffff. */ 968 /* This assumes ISA bus master with dma_mask 0xffffff. */
958 return NULL; 969 return NULL;
959} 970}
971EXPORT_SYMBOL(alpha_gendev_to_pci);
960 972
961int 973int
962dma_set_mask(struct device *dev, u64 mask) 974dma_set_mask(struct device *dev, u64 mask)
@@ -969,3 +981,4 @@ dma_set_mask(struct device *dev, u64 mask)
969 981
970 return 0; 982 return 0;
971} 983}
984EXPORT_SYMBOL(dma_set_mask);
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index b3a8a2980365..3370e6faeae0 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -205,6 +205,7 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
205 regs->ps = 8; 205 regs->ps = 8;
206 wrusp(sp); 206 wrusp(sp);
207} 207}
208EXPORT_SYMBOL(start_thread);
208 209
209/* 210/*
210 * Free current thread data structures etc.. 211 * Free current thread data structures etc..
@@ -376,6 +377,7 @@ dump_thread(struct pt_regs * pt, struct user * dump)
376 dump->regs[EF_A2] = pt->r18; 377 dump->regs[EF_A2] = pt->r18;
377 memcpy((char *)dump->regs + EF_SIZE, sw->fp, 32 * 8); 378 memcpy((char *)dump->regs + EF_SIZE, sw->fp, 32 * 8);
378} 379}
380EXPORT_SYMBOL(dump_thread);
379 381
380/* 382/*
381 * Fill in the user structure for a ELF core dump. 383 * Fill in the user structure for a ELF core dump.
@@ -424,6 +426,7 @@ dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
424 useful value of the thread's UNIQUE field. */ 426 useful value of the thread's UNIQUE field. */
425 dest[32] = ti->pcb.unique; 427 dest[32] = ti->pcb.unique;
426} 428}
429EXPORT_SYMBOL(dump_elf_thread);
427 430
428int 431int
429dump_elf_task(elf_greg_t *dest, struct task_struct *task) 432dump_elf_task(elf_greg_t *dest, struct task_struct *task)
@@ -431,6 +434,7 @@ dump_elf_task(elf_greg_t *dest, struct task_struct *task)
431 dump_elf_thread(dest, task_pt_regs(task), task_thread_info(task)); 434 dump_elf_thread(dest, task_pt_regs(task), task_thread_info(task));
432 return 1; 435 return 1;
433} 436}
437EXPORT_SYMBOL(dump_elf_task);
434 438
435int 439int
436dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task) 440dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
@@ -439,6 +443,7 @@ dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
439 memcpy(dest, sw->fp, 32 * 8); 443 memcpy(dest, sw->fp, 32 * 8);
440 return 1; 444 return 1;
441} 445}
446EXPORT_SYMBOL(dump_elf_task_fp);
442 447
443/* 448/*
444 * sys_execve() executes a new program. 449 * sys_execve() executes a new program.
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index a94e6d93e2ee..1aea7c7c683c 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -66,6 +66,7 @@ static struct notifier_block alpha_panic_block = {
66 66
67 67
68struct hwrpb_struct *hwrpb; 68struct hwrpb_struct *hwrpb;
69EXPORT_SYMBOL(hwrpb);
69unsigned long srm_hae; 70unsigned long srm_hae;
70 71
71int alpha_l1i_cacheshape; 72int alpha_l1i_cacheshape;
@@ -111,6 +112,7 @@ unsigned long alpha_agpgart_size = DEFAULT_AGP_APER_SIZE;
111#ifdef CONFIG_ALPHA_GENERIC 112#ifdef CONFIG_ALPHA_GENERIC
112struct alpha_machine_vector alpha_mv; 113struct alpha_machine_vector alpha_mv;
113int alpha_using_srm; 114int alpha_using_srm;
115EXPORT_SYMBOL(alpha_using_srm);
114#endif 116#endif
115 117
116static struct alpha_machine_vector *get_sysvec(unsigned long, unsigned long, 118static struct alpha_machine_vector *get_sysvec(unsigned long, unsigned long,
@@ -137,6 +139,8 @@ struct screen_info screen_info = {
137 .orig_video_points = 16 139 .orig_video_points = 16
138}; 140};
139 141
142EXPORT_SYMBOL(screen_info);
143
140/* 144/*
141 * The direct map I/O window, if any. This should be the same 145 * The direct map I/O window, if any. This should be the same
142 * for all busses, since it's used by virt_to_bus. 146 * for all busses, since it's used by virt_to_bus.
@@ -144,6 +148,8 @@ struct screen_info screen_info = {
144 148
145unsigned long __direct_map_base; 149unsigned long __direct_map_base;
146unsigned long __direct_map_size; 150unsigned long __direct_map_size;
151EXPORT_SYMBOL(__direct_map_base);
152EXPORT_SYMBOL(__direct_map_size);
147 153
148/* 154/*
149 * Declare all of the machine vectors. 155 * Declare all of the machine vectors.
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 596780e2c7da..d1ec4f51df1a 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -52,6 +52,7 @@
52 52
53/* A collection of per-processor data. */ 53/* A collection of per-processor data. */
54struct cpuinfo_alpha cpu_data[NR_CPUS]; 54struct cpuinfo_alpha cpu_data[NR_CPUS];
55EXPORT_SYMBOL(cpu_data);
55 56
56/* A collection of single bit ipi messages. */ 57/* A collection of single bit ipi messages. */
57static struct { 58static struct {
@@ -74,6 +75,7 @@ EXPORT_SYMBOL(cpu_online_map);
74 75
75int smp_num_probed; /* Internal processor count */ 76int smp_num_probed; /* Internal processor count */
76int smp_num_cpus = 1; /* Number that came online. */ 77int smp_num_cpus = 1; /* Number that came online. */
78EXPORT_SYMBOL(smp_num_cpus);
77 79
78extern void calibrate_delay(void); 80extern void calibrate_delay(void);
79 81
@@ -790,6 +792,7 @@ smp_call_function_on_cpu (void (*func) (void *info), void *info, int retry,
790 792
791 return 0; 793 return 0;
792} 794}
795EXPORT_SYMBOL(smp_call_function_on_cpu);
793 796
794int 797int
795smp_call_function (void (*func) (void *info), void *info, int retry, int wait) 798smp_call_function (void (*func) (void *info), void *info, int retry, int wait)
@@ -797,6 +800,7 @@ smp_call_function (void (*func) (void *info), void *info, int retry, int wait)
797 return smp_call_function_on_cpu (func, info, retry, wait, 800 return smp_call_function_on_cpu (func, info, retry, wait,
798 cpu_online_map); 801 cpu_online_map);
799} 802}
803EXPORT_SYMBOL(smp_call_function);
800 804
801static void 805static void
802ipi_imb(void *ignored) 806ipi_imb(void *ignored)
@@ -811,6 +815,7 @@ smp_imb(void)
811 if (on_each_cpu(ipi_imb, NULL, 1, 1)) 815 if (on_each_cpu(ipi_imb, NULL, 1, 1))
812 printk(KERN_CRIT "smp_imb: timed out\n"); 816 printk(KERN_CRIT "smp_imb: timed out\n");
813} 817}
818EXPORT_SYMBOL(smp_imb);
814 819
815static void 820static void
816ipi_flush_tlb_all(void *ignored) 821ipi_flush_tlb_all(void *ignored)
@@ -866,6 +871,7 @@ flush_tlb_mm(struct mm_struct *mm)
866 871
867 preempt_enable(); 872 preempt_enable();
868} 873}
874EXPORT_SYMBOL(flush_tlb_mm);
869 875
870struct flush_tlb_page_struct { 876struct flush_tlb_page_struct {
871 struct vm_area_struct *vma; 877 struct vm_area_struct *vma;
@@ -918,6 +924,7 @@ flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
918 924
919 preempt_enable(); 925 preempt_enable();
920} 926}
927EXPORT_SYMBOL(flush_tlb_page);
921 928
922void 929void
923flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) 930flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
@@ -925,6 +932,7 @@ flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long e
925 /* On the Alpha we always flush the whole user tlb. */ 932 /* On the Alpha we always flush the whole user tlb. */
926 flush_tlb_mm(vma->vm_mm); 933 flush_tlb_mm(vma->vm_mm);
927} 934}
935EXPORT_SYMBOL(flush_tlb_range);
928 936
929static void 937static void
930ipi_flush_icache_page(void *x) 938ipi_flush_icache_page(void *x)
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index cf0666523989..d7053eb4ffcf 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -57,6 +57,7 @@
57static int set_rtc_mmss(unsigned long); 57static int set_rtc_mmss(unsigned long);
58 58
59DEFINE_SPINLOCK(rtc_lock); 59DEFINE_SPINLOCK(rtc_lock);
60EXPORT_SYMBOL(rtc_lock);
60 61
61#define TICK_SIZE (tick_nsec / 1000) 62#define TICK_SIZE (tick_nsec / 1000)
62 63
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index b826f58c6e72..e3e3806a6f25 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -13,12 +13,14 @@
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/initrd.h> 14#include <linux/initrd.h>
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/module.h>
16 17
17#include <asm/hwrpb.h> 18#include <asm/hwrpb.h>
18#include <asm/pgalloc.h> 19#include <asm/pgalloc.h>
19 20
20pg_data_t node_data[MAX_NUMNODES]; 21pg_data_t node_data[MAX_NUMNODES];
21bootmem_data_t node_bdata[MAX_NUMNODES]; 22bootmem_data_t node_bdata[MAX_NUMNODES];
23EXPORT_SYMBOL(node_data);
22 24
23#undef DEBUG_DISCONTIG 25#undef DEBUG_DISCONTIG
24#ifdef DEBUG_DISCONTIG 26#ifdef DEBUG_DISCONTIG
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index da69e660574b..4779f474f911 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -178,9 +178,3 @@ EXPORT_SYMBOL(_find_next_zero_bit_be);
178EXPORT_SYMBOL(_find_first_bit_be); 178EXPORT_SYMBOL(_find_first_bit_be);
179EXPORT_SYMBOL(_find_next_bit_be); 179EXPORT_SYMBOL(_find_next_bit_be);
180#endif 180#endif
181
182 /* syscalls */
183EXPORT_SYMBOL(sys_write);
184EXPORT_SYMBOL(sys_lseek);
185EXPORT_SYMBOL(sys_exit);
186EXPORT_SYMBOL(sys_wait4);
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 2aa150b57ba1..3b8576111c16 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -188,12 +188,12 @@ static struct map_desc versatile_io_desc[] __initdata = {
188 .length = SZ_4K, 188 .length = SZ_4K,
189 .type = MT_DEVICE 189 .type = MT_DEVICE
190 }, { 190 }, {
191 .virtual = VERSATILE_PCI_VIRT_BASE, 191 .virtual = (unsigned long)VERSATILE_PCI_VIRT_BASE,
192 .pfn = __phys_to_pfn(VERSATILE_PCI_BASE), 192 .pfn = __phys_to_pfn(VERSATILE_PCI_BASE),
193 .length = VERSATILE_PCI_BASE_SIZE, 193 .length = VERSATILE_PCI_BASE_SIZE,
194 .type = MT_DEVICE 194 .type = MT_DEVICE
195 }, { 195 }, {
196 .virtual = VERSATILE_PCI_CFG_VIRT_BASE, 196 .virtual = (unsigned long)VERSATILE_PCI_CFG_VIRT_BASE,
197 .pfn = __phys_to_pfn(VERSATILE_PCI_CFG_BASE), 197 .pfn = __phys_to_pfn(VERSATILE_PCI_CFG_BASE),
198 .length = VERSATILE_PCI_CFG_BASE_SIZE, 198 .length = VERSATILE_PCI_CFG_BASE_SIZE,
199 .type = MT_DEVICE 199 .type = MT_DEVICE
diff --git a/arch/arm/mach-versatile/pci.c b/arch/arm/mach-versatile/pci.c
index 13bbd08ff841..5cd0b5d9e7eb 100644
--- a/arch/arm/mach-versatile/pci.c
+++ b/arch/arm/mach-versatile/pci.c
@@ -40,14 +40,15 @@
40 * Cfg 42000000 - 42FFFFFF PCI config 40 * Cfg 42000000 - 42FFFFFF PCI config
41 * 41 *
42 */ 42 */
43#define SYS_PCICTL IO_ADDRESS(VERSATILE_SYS_PCICTL) 43#define __IO_ADDRESS(n) ((void __iomem *)(unsigned long)IO_ADDRESS(n))
44#define PCI_IMAP0 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x0) 44#define SYS_PCICTL __IO_ADDRESS(VERSATILE_SYS_PCICTL)
45#define PCI_IMAP1 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x4) 45#define PCI_IMAP0 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x0)
46#define PCI_IMAP2 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x8) 46#define PCI_IMAP1 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x4)
47#define PCI_SMAP0 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x10) 47#define PCI_IMAP2 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x8)
48#define PCI_SMAP1 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x14) 48#define PCI_SMAP0 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x10)
49#define PCI_SMAP2 IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x18) 49#define PCI_SMAP1 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x14)
50#define PCI_SELFID IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0xc) 50#define PCI_SMAP2 __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0x18)
51#define PCI_SELFID __IO_ADDRESS(VERSATILE_PCI_CORE_BASE+0xc)
51 52
52#define DEVICE_ID_OFFSET 0x00 53#define DEVICE_ID_OFFSET 0x00
53#define CSR_OFFSET 0x04 54#define CSR_OFFSET 0x04
@@ -76,7 +77,7 @@ static int __init versatile_pci_slot_ignore(char *str)
76__setup("pci_slot_ignore=", versatile_pci_slot_ignore); 77__setup("pci_slot_ignore=", versatile_pci_slot_ignore);
77 78
78 79
79static unsigned long __pci_addr(struct pci_bus *bus, 80static void __iomem *__pci_addr(struct pci_bus *bus,
80 unsigned int devfn, int offset) 81 unsigned int devfn, int offset)
81{ 82{
82 unsigned int busnr = bus->number; 83 unsigned int busnr = bus->number;
@@ -91,14 +92,14 @@ static unsigned long __pci_addr(struct pci_bus *bus,
91 if (devfn > 255) 92 if (devfn > 255)
92 BUG(); 93 BUG();
93 94
94 return (VERSATILE_PCI_CFG_VIRT_BASE | (busnr << 16) | 95 return VERSATILE_PCI_CFG_VIRT_BASE + ((busnr << 16) |
95 (PCI_SLOT(devfn) << 11) | (PCI_FUNC(devfn) << 8) | offset); 96 (PCI_SLOT(devfn) << 11) | (PCI_FUNC(devfn) << 8) | offset);
96} 97}
97 98
98static int versatile_read_config(struct pci_bus *bus, unsigned int devfn, int where, 99static int versatile_read_config(struct pci_bus *bus, unsigned int devfn, int where,
99 int size, u32 *val) 100 int size, u32 *val)
100{ 101{
101 unsigned long addr = __pci_addr(bus, devfn, where); 102 void __iomem *addr = __pci_addr(bus, devfn, where & ~3);
102 u32 v; 103 u32 v;
103 int slot = PCI_SLOT(devfn); 104 int slot = PCI_SLOT(devfn);
104 105
@@ -121,13 +122,12 @@ static int versatile_read_config(struct pci_bus *bus, unsigned int devfn, int wh
121 break; 122 break;
122 123
123 case 2: 124 case 2:
124 v = __raw_readl(addr & ~3); 125 v = __raw_readl(addr);
125 if (addr & 2) v >>= 16; 126 if (where & 2) v >>= 16;
126 v &= 0xffff; 127 v &= 0xffff;
127 break; 128 break;
128 129
129 default: 130 default:
130 addr &= ~3;
131 v = __raw_readl(addr); 131 v = __raw_readl(addr);
132 break; 132 break;
133 } 133 }
@@ -140,7 +140,7 @@ static int versatile_read_config(struct pci_bus *bus, unsigned int devfn, int wh
140static int versatile_write_config(struct pci_bus *bus, unsigned int devfn, int where, 140static int versatile_write_config(struct pci_bus *bus, unsigned int devfn, int where,
141 int size, u32 val) 141 int size, u32 val)
142{ 142{
143 unsigned long addr = __pci_addr(bus, devfn, where); 143 void __iomem *addr = __pci_addr(bus, devfn, where);
144 int slot = PCI_SLOT(devfn); 144 int slot = PCI_SLOT(devfn);
145 145
146 if (pci_slot_ignore & (1 << slot)) { 146 if (pci_slot_ignore & (1 << slot)) {
@@ -279,7 +279,7 @@ int __init pci_versatile_setup(int nr, struct pci_sys_data *sys)
279 printk("PCI core found (slot %d)\n",myslot); 279 printk("PCI core found (slot %d)\n",myslot);
280 280
281 __raw_writel(myslot, PCI_SELFID); 281 __raw_writel(myslot, PCI_SELFID);
282 local_pci_cfg_base = (void *) VERSATILE_PCI_CFG_VIRT_BASE + (myslot << 11); 282 local_pci_cfg_base = VERSATILE_PCI_CFG_VIRT_BASE + (myslot << 11);
283 283
284 val = __raw_readl(local_pci_cfg_base + CSR_OFFSET); 284 val = __raw_readl(local_pci_cfg_base + CSR_OFFSET);
285 val |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE; 285 val |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE;
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index dedbb449632e..a657a28f08db 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -90,7 +90,7 @@ void vfp_raise_sigfpe(unsigned int sicode, struct pt_regs *regs)
90 90
91 info.si_signo = SIGFPE; 91 info.si_signo = SIGFPE;
92 info.si_code = sicode; 92 info.si_code = sicode;
93 info.si_addr = (void *)(instruction_pointer(regs) - 4); 93 info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
94 94
95 /* 95 /*
96 * This is the same as NWFPE, because it's not clear what 96 * This is the same as NWFPE, because it's not clear what
diff --git a/arch/arm26/kernel/armksyms.c b/arch/arm26/kernel/armksyms.c
index 07907b6ecb63..93293d04b303 100644
--- a/arch/arm26/kernel/armksyms.c
+++ b/arch/arm26/kernel/armksyms.c
@@ -202,14 +202,6 @@ EXPORT_SYMBOL(_find_next_zero_bit_le);
202EXPORT_SYMBOL(elf_platform); 202EXPORT_SYMBOL(elf_platform);
203EXPORT_SYMBOL(elf_hwcap); 203EXPORT_SYMBOL(elf_hwcap);
204 204
205 /* syscalls */
206EXPORT_SYMBOL(sys_write);
207EXPORT_SYMBOL(sys_read);
208EXPORT_SYMBOL(sys_lseek);
209EXPORT_SYMBOL(sys_open);
210EXPORT_SYMBOL(sys_exit);
211EXPORT_SYMBOL(sys_wait4);
212
213#ifdef CONFIG_PREEMPT 205#ifdef CONFIG_PREEMPT
214EXPORT_SYMBOL(kernel_flag); 206EXPORT_SYMBOL(kernel_flag);
215#endif 207#endif
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 3e56b9f4358a..5a247ba71a72 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -124,15 +124,15 @@ unsigned long long sched_clock(void)
124 * 124 *
125 * In UP mode, it is invoked from the (global) timer_interrupt. 125 * In UP mode, it is invoked from the (global) timer_interrupt.
126 */ 126 */
127static void local_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) 127static void local_timer_interrupt(int irq, void *dev_id)
128{ 128{
129 if (current->pid) 129 if (current->pid)
130 profile_tick(CPU_PROFILING, regs); 130 profile_tick(CPU_PROFILING);
131 update_process_times(user_mode(regs)); 131 update_process_times(user_mode(get_irq_regs()));
132} 132}
133 133
134static irqreturn_t 134static irqreturn_t
135timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) 135timer_interrupt(int irq, void *dev_id)
136{ 136{
137 unsigned int count; 137 unsigned int count;
138 138
@@ -157,7 +157,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
157 * 157 *
158 * SMP is not supported yet. 158 * SMP is not supported yet.
159 */ 159 */
160 local_timer_interrupt(irq, dev_id, regs); 160 local_timer_interrupt(irq, dev_id);
161 161
162 return IRQ_HANDLED; 162 return IRQ_HANDLED;
163} 163}
diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c
index 7da9c5f7a0eb..4dff1f988900 100644
--- a/arch/avr32/mach-at32ap/extint.c
+++ b/arch/avr32/mach-at32ap/extint.c
@@ -102,8 +102,7 @@ struct irq_chip eim_chip = {
102 .set_type = eim_set_irq_type, 102 .set_type = eim_set_irq_type,
103}; 103};
104 104
105static void demux_eim_irq(unsigned int irq, struct irq_desc *desc, 105static void demux_eim_irq(unsigned int irq, struct irq_desc *desc)
106 struct pt_regs *regs)
107{ 106{
108 struct at32_sm *sm = desc->handler_data; 107 struct at32_sm *sm = desc->handler_data;
109 struct irq_desc *ext_desc; 108 struct irq_desc *ext_desc;
@@ -121,7 +120,7 @@ static void demux_eim_irq(unsigned int irq, struct irq_desc *desc,
121 120
122 ext_irq = i + sm->eim_first_irq; 121 ext_irq = i + sm->eim_first_irq;
123 ext_desc = irq_desc + ext_irq; 122 ext_desc = irq_desc + ext_irq;
124 ext_desc->handle_irq(ext_irq, ext_desc, regs); 123 ext_desc->handle_irq(ext_irq, ext_desc);
125 } 124 }
126 125
127 spin_unlock(&sm->lock); 126 spin_unlock(&sm->lock);
diff --git a/arch/avr32/mach-at32ap/intc.c b/arch/avr32/mach-at32ap/intc.c
index 74f8c9f2f03d..eb87a18ad7b2 100644
--- a/arch/avr32/mach-at32ap/intc.c
+++ b/arch/avr32/mach-at32ap/intc.c
@@ -52,16 +52,19 @@ static struct intc intc0 = {
52asmlinkage void do_IRQ(int level, struct pt_regs *regs) 52asmlinkage void do_IRQ(int level, struct pt_regs *regs)
53{ 53{
54 struct irq_desc *desc; 54 struct irq_desc *desc;
55 struct pt_regs *old_regs;
55 unsigned int irq; 56 unsigned int irq;
56 unsigned long status_reg; 57 unsigned long status_reg;
57 58
58 local_irq_disable(); 59 local_irq_disable();
59 60
61 old_regs = set_irq_regs(regs);
62
60 irq_enter(); 63 irq_enter();
61 64
62 irq = intc_readl(&intc0, INTCAUSE0 - 4 * level); 65 irq = intc_readl(&intc0, INTCAUSE0 - 4 * level);
63 desc = irq_desc + irq; 66 desc = irq_desc + irq;
64 desc->handle_irq(irq, desc, regs); 67 desc->handle_irq(irq, desc);
65 68
66 /* 69 /*
67 * Clear all interrupt level masks so that we may handle 70 * Clear all interrupt level masks so that we may handle
@@ -75,6 +78,8 @@ asmlinkage void do_IRQ(int level, struct pt_regs *regs)
75 sysreg_write(SR, status_reg); 78 sysreg_write(SR, status_reg);
76 79
77 irq_exit(); 80 irq_exit();
81
82 set_irq_regs(old_regs);
78} 83}
79 84
80void __init init_IRQ(void) 85void __init init_IRQ(void)
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 21c9a4e71104..fc4f2abccf06 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -7,6 +7,7 @@ choice
7 7
8config M386 8config M386
9 bool "386" 9 bool "386"
10 depends on !UML
10 ---help--- 11 ---help---
11 This is the processor type of your CPU. This information is used for 12 This is the processor type of your CPU. This information is used for
12 optimizing purposes. In order to compile a kernel that can run on 13 optimizing purposes. In order to compile a kernel that can run on
@@ -301,7 +302,7 @@ config X86_USE_PPRO_CHECKSUM
301 302
302config X86_USE_3DNOW 303config X86_USE_3DNOW
303 bool 304 bool
304 depends on MCYRIXIII || MK7 || MGEODE_LX 305 depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
305 default y 306 default y
306 307
307config X86_OOSTORE 308config X86_OOSTORE
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index cd082c36ca03..27bceaf5ce40 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -2594,7 +2594,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2594} 2594}
2595#endif 2595#endif
2596 2596
2597static struct hw_interrupt_type ht_irq_chip = { 2597static struct irq_chip ht_irq_chip = {
2598 .name = "PCI-HT", 2598 .name = "PCI-HT",
2599 .mask = mask_ht_irq, 2599 .mask = mask_ht_irq,
2600 .unmask = unmask_ht_irq, 2600 .unmask = unmask_ht_irq,
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 9b9479768d5e..c4d0291b519f 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -656,14 +656,18 @@ static struct attribute_group mc_attr_group = {
656 656
657static int mc_sysdev_add(struct sys_device *sys_dev) 657static int mc_sysdev_add(struct sys_device *sys_dev)
658{ 658{
659 int cpu = sys_dev->id; 659 int err, cpu = sys_dev->id;
660 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 660 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
661 661
662 if (!cpu_online(cpu)) 662 if (!cpu_online(cpu))
663 return 0; 663 return 0;
664
664 pr_debug("Microcode:CPU %d added\n", cpu); 665 pr_debug("Microcode:CPU %d added\n", cpu);
665 memset(uci, 0, sizeof(*uci)); 666 memset(uci, 0, sizeof(*uci));
666 sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 667
668 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
669 if (err)
670 return err;
667 671
668 microcode_init_cpu(cpu); 672 microcode_init_cpu(cpu);
669 return 0; 673 return 0;
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 000cf03751fe..519e63c3c130 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1083,16 +1083,15 @@ static unsigned long __init setup_memory(void)
1083 1083
1084void __init zone_sizes_init(void) 1084void __init zone_sizes_init(void)
1085{ 1085{
1086 unsigned long max_zone_pfns[MAX_NR_ZONES];
1087 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1088 max_zone_pfns[ZONE_DMA] =
1089 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
1090 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
1086#ifdef CONFIG_HIGHMEM 1091#ifdef CONFIG_HIGHMEM
1087 unsigned long max_zone_pfns[MAX_NR_ZONES] = { 1092 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
1088 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
1089 max_low_pfn,
1090 highend_pfn};
1091 add_active_range(0, 0, highend_pfn); 1093 add_active_range(0, 0, highend_pfn);
1092#else 1094#else
1093 unsigned long max_zone_pfns[MAX_NR_ZONES] = {
1094 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
1095 max_low_pfn};
1096 add_active_range(0, 0, max_low_pfn); 1095 add_active_range(0, 0, max_low_pfn);
1097#endif 1096#endif
1098 1097
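Several mm hunks in this patch (the i386 one above, and the ia64, powerpc, ppc and x86_64 ones further down) replace positional initialization of max_zone_pfns[] with an explicit memset() followed by assignments to the named ZONE_* indices, so zones a configuration compiles out are reliably left at zero. A hedged sketch of the resulting shape — the function name and pfn parameters are placeholders, not taken from any of these files:

/*
 * Sketch of the max_zone_pfns[] initialization pattern adopted across the
 * paging_init()/zone_sizes_init() hunks in this patch. The pfn arguments
 * are placeholders; a real port fills them from its memory layout.
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/string.h>

void __init example_zone_sizes_init(unsigned long dma_pfn,
				    unsigned long low_pfn,
				    unsigned long high_pfn)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	/* Zero everything first so unused zones stay empty. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

	max_zone_pfns[ZONE_DMA]    = dma_pfn;
	max_zone_pfns[ZONE_NORMAL] = low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = high_pfn;
#endif
	free_area_init_nodes(max_zone_pfns);
}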
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f78b0b9..2697e9210e92 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,4 @@ ENTRY(sys_call_table)
318 .long sys_vmsplice 318 .long sys_vmsplice
319 .long sys_move_pages 319 .long sys_move_pages
320 .long sys_getcpu 320 .long sys_getcpu
321 .long sys_epoll_pwait
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 08502fc6d0cb..258df6b4d7d7 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -179,7 +179,7 @@ __clear_user(void __user *to, unsigned long n)
179EXPORT_SYMBOL(__clear_user); 179EXPORT_SYMBOL(__clear_user);
180 180
181/** 181/**
182 * strlen_user: - Get the size of a string in user space. 182 * strnlen_user: - Get the size of a string in user space.
183 * @s: The string to measure. 183 * @s: The string to measure.
184 * @n: The maximum valid length 184 * @n: The maximum valid length
185 * 185 *
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 455597db84df..ddbdb0336f28 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -356,11 +356,12 @@ void __init numa_kva_reserve(void)
356void __init zone_sizes_init(void) 356void __init zone_sizes_init(void)
357{ 357{
358 int nid; 358 int nid;
359 unsigned long max_zone_pfns[MAX_NR_ZONES] = { 359 unsigned long max_zone_pfns[MAX_NR_ZONES];
360 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT, 360 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
361 max_low_pfn, 361 max_zone_pfns[ZONE_DMA] =
362 highend_pfn 362 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
363 }; 363 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
364 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
364 365
365 /* If SRAT has not registered memory, register it now */ 366 /* If SRAT has not registered memory, register it now */
366 if (find_max_pfn_with_active_regions() == 0) { 367 if (find_max_pfn_with_active_regions() == 0) {
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index daf977ff2920..82deaa3a7c48 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -233,6 +233,7 @@ paging_init (void)
233 efi_memmap_walk(count_pages, &num_physpages); 233 efi_memmap_walk(count_pages, &num_physpages);
234 234
235 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; 235 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
236 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
236 max_zone_pfns[ZONE_DMA] = max_dma; 237 max_zone_pfns[ZONE_DMA] = max_dma;
237 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 238 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
238 239
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index d497b6b0f5b2..96722cb1b49d 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -709,6 +709,7 @@ void __init paging_init(void)
709 max_pfn = mem_data[node].max_pfn; 709 max_pfn = mem_data[node].max_pfn;
710 } 710 }
711 711
712 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
712 max_zone_pfns[ZONE_DMA] = max_dma; 713 max_zone_pfns[ZONE_DMA] = max_dma;
713 max_zone_pfns[ZONE_NORMAL] = max_pfn; 714 max_zone_pfns[ZONE_NORMAL] = max_pfn;
714 free_area_init_nodes(max_zone_pfns); 715 free_area_init_nodes(max_zone_pfns);
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 3f35ab3d2dc2..0e7778be33cc 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -369,10 +369,10 @@ static void c_stop(struct seq_file *m, void *v)
369} 369}
370 370
371struct seq_operations cpuinfo_op = { 371struct seq_operations cpuinfo_op = {
372 start: c_start, 372 .start = c_start,
373 next: c_next, 373 .next = c_next,
374 stop: c_stop, 374 .stop = c_stop,
375 show: show_cpuinfo, 375 .show = show_cpuinfo,
376}; 376};
377#endif /* CONFIG_PROC_FS */ 377#endif /* CONFIG_PROC_FS */
378 378
diff --git a/arch/m32r/kernel/setup_mappi.c b/arch/m32r/kernel/setup_mappi.c
index 67dbbdc9d111..6b2d77da0683 100644
--- a/arch/m32r/kernel/setup_mappi.c
+++ b/arch/m32r/kernel/setup_mappi.c
@@ -86,7 +86,7 @@ void __init init_IRQ(void)
86 /* INT0 : LAN controller (RTL8019AS) */ 86 /* INT0 : LAN controller (RTL8019AS) */
87 irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED; 87 irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
88 irq_desc[M32R_IRQ_INT0].chip = &mappi_irq_type; 88 irq_desc[M32R_IRQ_INT0].chip = &mappi_irq_type;
89 irq_desc[M32R_IRQ_INT0].action = 0; 89 irq_desc[M32R_IRQ_INT0].action = NULL;
90 irq_desc[M32R_IRQ_INT0].depth = 1; 90 irq_desc[M32R_IRQ_INT0].depth = 1;
91 icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10; 91 icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN|M32R_ICUCR_ISMOD10;
92 disable_mappi_irq(M32R_IRQ_INT0); 92 disable_mappi_irq(M32R_IRQ_INT0);
@@ -95,7 +95,7 @@ void __init init_IRQ(void)
95 /* MFT2 : system timer */ 95 /* MFT2 : system timer */
96 irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED; 96 irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
97 irq_desc[M32R_IRQ_MFT2].chip = &mappi_irq_type; 97 irq_desc[M32R_IRQ_MFT2].chip = &mappi_irq_type;
98 irq_desc[M32R_IRQ_MFT2].action = 0; 98 irq_desc[M32R_IRQ_MFT2].action = NULL;
99 irq_desc[M32R_IRQ_MFT2].depth = 1; 99 irq_desc[M32R_IRQ_MFT2].depth = 1;
100 icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN; 100 icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
101 disable_mappi_irq(M32R_IRQ_MFT2); 101 disable_mappi_irq(M32R_IRQ_MFT2);
@@ -104,7 +104,7 @@ void __init init_IRQ(void)
104 /* SIO0_R : uart receive data */ 104 /* SIO0_R : uart receive data */
105 irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED; 105 irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
106 irq_desc[M32R_IRQ_SIO0_R].chip = &mappi_irq_type; 106 irq_desc[M32R_IRQ_SIO0_R].chip = &mappi_irq_type;
107 irq_desc[M32R_IRQ_SIO0_R].action = 0; 107 irq_desc[M32R_IRQ_SIO0_R].action = NULL;
108 irq_desc[M32R_IRQ_SIO0_R].depth = 1; 108 irq_desc[M32R_IRQ_SIO0_R].depth = 1;
109 icu_data[M32R_IRQ_SIO0_R].icucr = 0; 109 icu_data[M32R_IRQ_SIO0_R].icucr = 0;
110 disable_mappi_irq(M32R_IRQ_SIO0_R); 110 disable_mappi_irq(M32R_IRQ_SIO0_R);
@@ -112,7 +112,7 @@ void __init init_IRQ(void)
112 /* SIO0_S : uart send data */ 112 /* SIO0_S : uart send data */
113 irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED; 113 irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
114 irq_desc[M32R_IRQ_SIO0_S].chip = &mappi_irq_type; 114 irq_desc[M32R_IRQ_SIO0_S].chip = &mappi_irq_type;
115 irq_desc[M32R_IRQ_SIO0_S].action = 0; 115 irq_desc[M32R_IRQ_SIO0_S].action = NULL;
116 irq_desc[M32R_IRQ_SIO0_S].depth = 1; 116 irq_desc[M32R_IRQ_SIO0_S].depth = 1;
117 icu_data[M32R_IRQ_SIO0_S].icucr = 0; 117 icu_data[M32R_IRQ_SIO0_S].icucr = 0;
118 disable_mappi_irq(M32R_IRQ_SIO0_S); 118 disable_mappi_irq(M32R_IRQ_SIO0_S);
@@ -120,7 +120,7 @@ void __init init_IRQ(void)
120 /* SIO1_R : uart receive data */ 120 /* SIO1_R : uart receive data */
121 irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED; 121 irq_desc[M32R_IRQ_SIO1_R].status = IRQ_DISABLED;
122 irq_desc[M32R_IRQ_SIO1_R].chip = &mappi_irq_type; 122 irq_desc[M32R_IRQ_SIO1_R].chip = &mappi_irq_type;
123 irq_desc[M32R_IRQ_SIO1_R].action = 0; 123 irq_desc[M32R_IRQ_SIO1_R].action = NULL;
124 irq_desc[M32R_IRQ_SIO1_R].depth = 1; 124 irq_desc[M32R_IRQ_SIO1_R].depth = 1;
125 icu_data[M32R_IRQ_SIO1_R].icucr = 0; 125 icu_data[M32R_IRQ_SIO1_R].icucr = 0;
126 disable_mappi_irq(M32R_IRQ_SIO1_R); 126 disable_mappi_irq(M32R_IRQ_SIO1_R);
@@ -128,7 +128,7 @@ void __init init_IRQ(void)
128 /* SIO1_S : uart send data */ 128 /* SIO1_S : uart send data */
129 irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED; 129 irq_desc[M32R_IRQ_SIO1_S].status = IRQ_DISABLED;
130 irq_desc[M32R_IRQ_SIO1_S].chip = &mappi_irq_type; 130 irq_desc[M32R_IRQ_SIO1_S].chip = &mappi_irq_type;
131 irq_desc[M32R_IRQ_SIO1_S].action = 0; 131 irq_desc[M32R_IRQ_SIO1_S].action = NULL;
132 irq_desc[M32R_IRQ_SIO1_S].depth = 1; 132 irq_desc[M32R_IRQ_SIO1_S].depth = 1;
133 icu_data[M32R_IRQ_SIO1_S].icucr = 0; 133 icu_data[M32R_IRQ_SIO1_S].icucr = 0;
134 disable_mappi_irq(M32R_IRQ_SIO1_S); 134 disable_mappi_irq(M32R_IRQ_SIO1_S);
@@ -138,7 +138,7 @@ void __init init_IRQ(void)
138 /* INT1 : pccard0 interrupt */ 138 /* INT1 : pccard0 interrupt */
139 irq_desc[M32R_IRQ_INT1].status = IRQ_DISABLED; 139 irq_desc[M32R_IRQ_INT1].status = IRQ_DISABLED;
140 irq_desc[M32R_IRQ_INT1].chip = &mappi_irq_type; 140 irq_desc[M32R_IRQ_INT1].chip = &mappi_irq_type;
141 irq_desc[M32R_IRQ_INT1].action = 0; 141 irq_desc[M32R_IRQ_INT1].action = NULL;
142 irq_desc[M32R_IRQ_INT1].depth = 1; 142 irq_desc[M32R_IRQ_INT1].depth = 1;
143 icu_data[M32R_IRQ_INT1].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00; 143 icu_data[M32R_IRQ_INT1].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00;
144 disable_mappi_irq(M32R_IRQ_INT1); 144 disable_mappi_irq(M32R_IRQ_INT1);
@@ -146,7 +146,7 @@ void __init init_IRQ(void)
146 /* INT2 : pccard1 interrupt */ 146 /* INT2 : pccard1 interrupt */
147 irq_desc[M32R_IRQ_INT2].status = IRQ_DISABLED; 147 irq_desc[M32R_IRQ_INT2].status = IRQ_DISABLED;
148 irq_desc[M32R_IRQ_INT2].chip = &mappi_irq_type; 148 irq_desc[M32R_IRQ_INT2].chip = &mappi_irq_type;
149 irq_desc[M32R_IRQ_INT2].action = 0; 149 irq_desc[M32R_IRQ_INT2].action = NULL;
150 irq_desc[M32R_IRQ_INT2].depth = 1; 150 irq_desc[M32R_IRQ_INT2].depth = 1;
151 icu_data[M32R_IRQ_INT2].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00; 151 icu_data[M32R_IRQ_INT2].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD00;
152 disable_mappi_irq(M32R_IRQ_INT2); 152 disable_mappi_irq(M32R_IRQ_INT2);
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index a9174efe80cb..b60cea4aebaa 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -33,7 +33,7 @@
33int do_signal(struct pt_regs *, sigset_t *); 33int do_signal(struct pt_regs *, sigset_t *);
34 34
35asmlinkage int 35asmlinkage int
36sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize, 36sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize,
37 unsigned long r2, unsigned long r3, unsigned long r4, 37 unsigned long r2, unsigned long r3, unsigned long r4,
38 unsigned long r5, unsigned long r6, struct pt_regs *regs) 38 unsigned long r5, unsigned long r6, struct pt_regs *regs)
39{ 39{
@@ -78,8 +78,8 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
78struct rt_sigframe 78struct rt_sigframe
79{ 79{
80 int sig; 80 int sig;
81 struct siginfo *pinfo; 81 struct siginfo __user *pinfo;
82 void *puc; 82 void __user *puc;
83 struct siginfo info; 83 struct siginfo info;
84 struct ucontext uc; 84 struct ucontext uc;
85// struct _fpstate fpstate; 85// struct _fpstate fpstate;
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 722e21f556dc..360129174b2b 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -231,7 +231,7 @@ void smp_flush_tlb_all(void)
231 local_irq_save(flags); 231 local_irq_save(flags);
232 __flush_tlb_all(); 232 __flush_tlb_all();
233 local_irq_restore(flags); 233 local_irq_restore(flags);
234 smp_call_function(flush_tlb_all_ipi, 0, 1, 1); 234 smp_call_function(flush_tlb_all_ipi, NULL, 1, 1);
235 preempt_enable(); 235 preempt_enable();
236} 236}
237 237
diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c
index b567351f3c52..b4e7bcb43540 100644
--- a/arch/m32r/kernel/sys_m32r.c
+++ b/arch/m32r/kernel/sys_m32r.c
@@ -31,7 +31,7 @@
31/* 31/*
32 * sys_tas() - test-and-set 32 * sys_tas() - test-and-set
33 */ 33 */
34asmlinkage int sys_tas(int *addr) 34asmlinkage int sys_tas(int __user *addr)
35{ 35{
36 int oldval; 36 int oldval;
37 37
@@ -90,7 +90,7 @@ sys_pipe(unsigned long r0, unsigned long r1, unsigned long r2,
90 90
91 error = do_pipe(fd); 91 error = do_pipe(fd);
92 if (!error) { 92 if (!error) {
93 if (copy_to_user((void *)r0, (void *)fd, 2*sizeof(int))) 93 if (copy_to_user((void __user *)r0, fd, 2*sizeof(int)))
94 error = -EFAULT; 94 error = -EFAULT;
95 } 95 }
96 return error; 96 return error;
@@ -201,7 +201,7 @@ asmlinkage int sys_ipc(uint call, int first, int second,
201 } 201 }
202} 202}
203 203
204asmlinkage int sys_uname(struct old_utsname * name) 204asmlinkage int sys_uname(struct old_utsname __user * name)
205{ 205{
206 int err; 206 int err;
207 if (!name) 207 if (!name)
diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c
index c1daf2c40c7c..97e0b1c0830e 100644
--- a/arch/m32r/kernel/traps.c
+++ b/arch/m32r/kernel/traps.c
@@ -268,7 +268,7 @@ static __inline__ void do_trap(int trapnr, int signr, const char * str,
268#define DO_ERROR(trapnr, signr, str, name) \ 268#define DO_ERROR(trapnr, signr, str, name) \
269asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 269asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
270{ \ 270{ \
271 do_trap(trapnr, signr, 0, regs, error_code, NULL); \ 271 do_trap(trapnr, signr, NULL, regs, error_code, NULL); \
272} 272}
273 273
274#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 274#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
diff --git a/arch/m68k/kernel/m68k_ksyms.c b/arch/m68k/kernel/m68k_ksyms.c
index f9636e84e6a4..6fc69c74fe2e 100644
--- a/arch/m68k/kernel/m68k_ksyms.c
+++ b/arch/m68k/kernel/m68k_ksyms.c
@@ -1,61 +1,10 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/linkage.h>
3#include <linux/sched.h>
4#include <linux/mm.h>
5#include <linux/user.h>
6#include <linux/elfcore.h>
7#include <linux/in6.h>
8#include <linux/interrupt.h>
9
10#include <asm/setup.h>
11#include <asm/machdep.h>
12#include <asm/pgalloc.h>
13#include <asm/irq.h>
14#include <asm/io.h>
15#include <asm/semaphore.h> 2#include <asm/semaphore.h>
16#include <asm/checksum.h>
17 3
18asmlinkage long long __ashldi3 (long long, int); 4asmlinkage long long __ashldi3 (long long, int);
19asmlinkage long long __ashrdi3 (long long, int); 5asmlinkage long long __ashrdi3 (long long, int);
20asmlinkage long long __lshrdi3 (long long, int); 6asmlinkage long long __lshrdi3 (long long, int);
21asmlinkage long long __muldi3 (long long, long long); 7asmlinkage long long __muldi3 (long long, long long);
22extern char m68k_debug_device[];
23
24/* platform dependent support */
25
26EXPORT_SYMBOL(m68k_machtype);
27EXPORT_SYMBOL(m68k_cputype);
28EXPORT_SYMBOL(m68k_is040or060);
29EXPORT_SYMBOL(m68k_realnum_memory);
30EXPORT_SYMBOL(m68k_memory);
31#ifndef CONFIG_SUN3
32EXPORT_SYMBOL(cache_push);
33EXPORT_SYMBOL(cache_clear);
34#ifndef CONFIG_SINGLE_MEMORY_CHUNK
35EXPORT_SYMBOL(mm_vtop);
36EXPORT_SYMBOL(mm_ptov);
37EXPORT_SYMBOL(mm_end_of_chunk);
38#else
39EXPORT_SYMBOL(m68k_memoffset);
40#endif /* !CONFIG_SINGLE_MEMORY_CHUNK */
41EXPORT_SYMBOL(__ioremap);
42EXPORT_SYMBOL(iounmap);
43EXPORT_SYMBOL(kernel_set_cachemode);
44#endif /* !CONFIG_SUN3 */
45EXPORT_SYMBOL(m68k_debug_device);
46EXPORT_SYMBOL(mach_hwclk);
47EXPORT_SYMBOL(mach_get_ss);
48EXPORT_SYMBOL(mach_get_rtc_pll);
49EXPORT_SYMBOL(mach_set_rtc_pll);
50#ifdef CONFIG_INPUT_M68K_BEEP_MODULE
51EXPORT_SYMBOL(mach_beep);
52#endif
53EXPORT_SYMBOL(dump_fpu);
54EXPORT_SYMBOL(dump_thread);
55EXPORT_SYMBOL(kernel_thread);
56#ifdef CONFIG_VME
57EXPORT_SYMBOL(vme_brdtype);
58#endif
59 8
60/* The following are special because they're not called 9/* The following are special because they're not called
61 explicitly (the C compiler generates them). Fortunately, 10 explicitly (the C compiler generates them). Fortunately,
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 45a46646c1b3..99fc1226f7f8 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -187,6 +187,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
187 set_fs (fs); 187 set_fs (fs);
188 return pid; 188 return pid;
189} 189}
190EXPORT_SYMBOL(kernel_thread);
190 191
191void flush_thread(void) 192void flush_thread(void)
192{ 193{
@@ -221,13 +222,13 @@ asmlinkage int m68k_clone(struct pt_regs *regs)
221{ 222{
222 unsigned long clone_flags; 223 unsigned long clone_flags;
223 unsigned long newsp; 224 unsigned long newsp;
224 int *parent_tidptr, *child_tidptr; 225 int __user *parent_tidptr, *child_tidptr;
225 226
226 /* syscall2 puts clone_flags in d1 and usp in d2 */ 227 /* syscall2 puts clone_flags in d1 and usp in d2 */
227 clone_flags = regs->d1; 228 clone_flags = regs->d1;
228 newsp = regs->d2; 229 newsp = regs->d2;
229 parent_tidptr = (int *)regs->d3; 230 parent_tidptr = (int __user *)regs->d3;
230 child_tidptr = (int *)regs->d4; 231 child_tidptr = (int __user *)regs->d4;
231 if (!newsp) 232 if (!newsp)
232 newsp = rdusp(); 233 newsp = rdusp();
233 return do_fork(clone_flags, newsp, regs, 0, 234 return do_fork(clone_flags, newsp, regs, 0,
@@ -311,6 +312,7 @@ int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu)
311 : "memory"); 312 : "memory");
312 return 1; 313 return 1;
313} 314}
315EXPORT_SYMBOL(dump_fpu);
314 316
315/* 317/*
316 * fill in the user structure for a core dump.. 318 * fill in the user structure for a core dump..
@@ -357,11 +359,12 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
357 /* dump floating point stuff */ 359 /* dump floating point stuff */
358 dump->u_fpvalid = dump_fpu (regs, &dump->m68kfp); 360 dump->u_fpvalid = dump_fpu (regs, &dump->m68kfp);
359} 361}
362EXPORT_SYMBOL(dump_thread);
360 363
361/* 364/*
362 * sys_execve() executes a new program. 365 * sys_execve() executes a new program.
363 */ 366 */
364asmlinkage int sys_execve(char *name, char **argv, char **envp) 367asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
365{ 368{
366 int error; 369 int error;
367 char * filename; 370 char * filename;
diff --git a/arch/m68k/kernel/setup.c b/arch/m68k/kernel/setup.c
index 42d5b85f3350..9af3ee0e555d 100644
--- a/arch/m68k/kernel/setup.c
+++ b/arch/m68k/kernel/setup.c
@@ -42,27 +42,37 @@
42 42
43unsigned long m68k_machtype; 43unsigned long m68k_machtype;
44unsigned long m68k_cputype; 44unsigned long m68k_cputype;
45EXPORT_SYMBOL(m68k_machtype);
46EXPORT_SYMBOL(m68k_cputype);
45unsigned long m68k_fputype; 47unsigned long m68k_fputype;
46unsigned long m68k_mmutype; 48unsigned long m68k_mmutype;
47#ifdef CONFIG_VME 49#ifdef CONFIG_VME
48unsigned long vme_brdtype; 50unsigned long vme_brdtype;
51EXPORT_SYMBOL(vme_brdtype);
49#endif 52#endif
50 53
51int m68k_is040or060; 54int m68k_is040or060;
55EXPORT_SYMBOL(m68k_is040or060);
52 56
53extern int end; 57extern int end;
54extern unsigned long availmem; 58extern unsigned long availmem;
55 59
56int m68k_num_memory; 60int m68k_num_memory;
57int m68k_realnum_memory; 61int m68k_realnum_memory;
62EXPORT_SYMBOL(m68k_realnum_memory);
63#ifdef CONFIG_SINGLE_MEMORY_CHUNK
58unsigned long m68k_memoffset; 64unsigned long m68k_memoffset;
65EXPORT_SYMBOL(m68k_memoffset);
66#endif
59struct mem_info m68k_memory[NUM_MEMINFO]; 67struct mem_info m68k_memory[NUM_MEMINFO];
68EXPORT_SYMBOL(m68k_memory);
60 69
61static struct mem_info m68k_ramdisk; 70static struct mem_info m68k_ramdisk;
62 71
63static char m68k_command_line[CL_SIZE]; 72static char m68k_command_line[CL_SIZE];
64 73
65char m68k_debug_device[6] = ""; 74char m68k_debug_device[6] = "";
75EXPORT_SYMBOL(m68k_debug_device);
66 76
67void (*mach_sched_init) (irq_handler_t handler) __initdata = NULL; 77void (*mach_sched_init) (irq_handler_t handler) __initdata = NULL;
68/* machine dependent irq functions */ 78/* machine dependent irq functions */
@@ -72,10 +82,14 @@ int (*mach_get_hardware_list) (char *buffer);
72/* machine dependent timer functions */ 82/* machine dependent timer functions */
73unsigned long (*mach_gettimeoffset) (void); 83unsigned long (*mach_gettimeoffset) (void);
74int (*mach_hwclk) (int, struct rtc_time*); 84int (*mach_hwclk) (int, struct rtc_time*);
85EXPORT_SYMBOL(mach_hwclk);
75int (*mach_set_clock_mmss) (unsigned long); 86int (*mach_set_clock_mmss) (unsigned long);
76unsigned int (*mach_get_ss)(void); 87unsigned int (*mach_get_ss)(void);
77int (*mach_get_rtc_pll)(struct rtc_pll_info *); 88int (*mach_get_rtc_pll)(struct rtc_pll_info *);
78int (*mach_set_rtc_pll)(struct rtc_pll_info *); 89int (*mach_set_rtc_pll)(struct rtc_pll_info *);
90EXPORT_SYMBOL(mach_get_ss);
91EXPORT_SYMBOL(mach_get_rtc_pll);
92EXPORT_SYMBOL(mach_set_rtc_pll);
79void (*mach_reset)( void ); 93void (*mach_reset)( void );
80void (*mach_halt)( void ); 94void (*mach_halt)( void );
81void (*mach_power_off)( void ); 95void (*mach_power_off)( void );
@@ -89,6 +103,7 @@ void (*mach_l2_flush) (int);
89#endif 103#endif
90#if defined(CONFIG_INPUT_M68K_BEEP) || defined(CONFIG_INPUT_M68K_BEEP_MODULE) 104#if defined(CONFIG_INPUT_M68K_BEEP) || defined(CONFIG_INPUT_M68K_BEEP_MODULE)
91void (*mach_beep)(unsigned int, unsigned int); 105void (*mach_beep)(unsigned int, unsigned int);
106EXPORT_SYMBOL(mach_beep);
92#endif 107#endif
93#if defined(CONFIG_ISA) && defined(MULTI_ISA) 108#if defined(CONFIG_ISA) && defined(MULTI_ISA)
94int isa_type; 109int isa_type;
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 4569406a2e1f..759fa244e6cd 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -326,13 +326,13 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba,
326 326
327 switch (wbs & WBSIZ_040) { 327 switch (wbs & WBSIZ_040) {
328 case BA_SIZE_BYTE: 328 case BA_SIZE_BYTE:
329 res = put_user(wbd & 0xff, (char *)wba); 329 res = put_user(wbd & 0xff, (char __user *)wba);
330 break; 330 break;
331 case BA_SIZE_WORD: 331 case BA_SIZE_WORD:
332 res = put_user(wbd & 0xffff, (short *)wba); 332 res = put_user(wbd & 0xffff, (short __user *)wba);
333 break; 333 break;
334 case BA_SIZE_LONG: 334 case BA_SIZE_LONG:
335 res = put_user(wbd, (int *)wba); 335 res = put_user(wbd, (int __user *)wba);
336 break; 336 break;
337 } 337 }
338 338
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index f46f049d29ff..b54ef1726c55 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -7,6 +7,7 @@
7 * used by other architectures /Roman Zippel 7 * used by other architectures /Roman Zippel
8 */ 8 */
9 9
10#include <linux/module.h>
10#include <linux/mm.h> 11#include <linux/mm.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/string.h> 13#include <linux/string.h>
@@ -219,6 +220,7 @@ void __iomem *__ioremap(unsigned long physaddr, unsigned long size, int cachefla
219 220
220 return (void __iomem *)retaddr; 221 return (void __iomem *)retaddr;
221} 222}
223EXPORT_SYMBOL(__ioremap);
222 224
223/* 225/*
224 * Unmap a ioremap()ed region again 226 * Unmap a ioremap()ed region again
@@ -234,6 +236,7 @@ void iounmap(void __iomem *addr)
234 free_io_area((__force void *)addr); 236 free_io_area((__force void *)addr);
235#endif 237#endif
236} 238}
239EXPORT_SYMBOL(iounmap);
237 240
238/* 241/*
239 * __iounmap unmaps nearly everything, so be careful 242 * __iounmap unmaps nearly everything, so be careful
@@ -360,3 +363,4 @@ void kernel_set_cachemode(void *addr, unsigned long size, int cmode)
360 363
361 flush_tlb_all(); 364 flush_tlb_all();
362} 365}
366EXPORT_SYMBOL(kernel_set_cachemode);
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index a0c095e17222..0f88812822b1 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995 Hamish Macdonald 4 * Copyright (C) 1995 Hamish Macdonald
5 */ 5 */
6 6
7#include <linux/module.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/string.h> 10#include <linux/string.h>
@@ -157,9 +158,8 @@ unsigned long mm_vtop(unsigned long vaddr)
157 158
158 return -1; 159 return -1;
159} 160}
160#endif 161EXPORT_SYMBOL(mm_vtop);
161 162
162#ifndef CONFIG_SINGLE_MEMORY_CHUNK
163unsigned long mm_ptov (unsigned long paddr) 163unsigned long mm_ptov (unsigned long paddr)
164{ 164{
165 int i = 0; 165 int i = 0;
@@ -185,6 +185,7 @@ unsigned long mm_ptov (unsigned long paddr)
185#endif 185#endif
186 return -1; 186 return -1;
187} 187}
188EXPORT_SYMBOL(mm_ptov);
188#endif 189#endif
189 190
190/* invalidate page in both caches */ 191/* invalidate page in both caches */
@@ -298,6 +299,7 @@ void cache_clear (unsigned long paddr, int len)
298 mach_l2_flush(0); 299 mach_l2_flush(0);
299#endif 300#endif
300} 301}
302EXPORT_SYMBOL(cache_clear); /* probably can be unexported */
301 303
302 304
303/* 305/*
@@ -350,6 +352,7 @@ void cache_push (unsigned long paddr, int len)
350 mach_l2_flush(1); 352 mach_l2_flush(1);
351#endif 353#endif
352} 354}
355EXPORT_SYMBOL(cache_push); /* probably can be unexported */
353 356
354#ifndef CONFIG_SINGLE_MEMORY_CHUNK 357#ifndef CONFIG_SINGLE_MEMORY_CHUNK
355int mm_end_of_chunk (unsigned long addr, int len) 358int mm_end_of_chunk (unsigned long addr, int len)
@@ -361,4 +364,5 @@ int mm_end_of_chunk (unsigned long addr, int len)
361 return 1; 364 return 1;
362 return 0; 365 return 0;
363} 366}
367EXPORT_SYMBOL(mm_end_of_chunk);
364#endif 368#endif
diff --git a/arch/m68k/mm/sun3kmap.c b/arch/m68k/mm/sun3kmap.c
index 7f0d86f3fe73..1af24cb5bfe1 100644
--- a/arch/m68k/mm/sun3kmap.c
+++ b/arch/m68k/mm/sun3kmap.c
@@ -8,6 +8,7 @@
8 * for more details. 8 * for more details.
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/mm.h> 14#include <linux/mm.h>
@@ -59,7 +60,7 @@ static inline void do_pmeg_mapin(unsigned long phys, unsigned long virt,
59 } 60 }
60} 61}
61 62
62void *sun3_ioremap(unsigned long phys, unsigned long size, 63void __iomem *sun3_ioremap(unsigned long phys, unsigned long size,
63 unsigned long type) 64 unsigned long type)
64{ 65{
65 struct vm_struct *area; 66 struct vm_struct *area;
@@ -101,22 +102,24 @@ void *sun3_ioremap(unsigned long phys, unsigned long size,
101 virt += seg_pages * PAGE_SIZE; 102 virt += seg_pages * PAGE_SIZE;
102 } 103 }
103 104
104 return (void *)ret; 105 return (void __iomem *)ret;
105 106
106} 107}
107 108
108 109
109void *__ioremap(unsigned long phys, unsigned long size, int cache) 110void __iomem *__ioremap(unsigned long phys, unsigned long size, int cache)
110{ 111{
111 112
112 return sun3_ioremap(phys, size, SUN3_PAGE_TYPE_IO); 113 return sun3_ioremap(phys, size, SUN3_PAGE_TYPE_IO);
113 114
114} 115}
116EXPORT_SYMBOL(__ioremap);
115 117
116void iounmap(void *addr) 118void iounmap(void __iomem *addr)
117{ 119{
118 vfree((void *)(PAGE_MASK & (unsigned long)addr)); 120 vfree((void *)(PAGE_MASK & (unsigned long)addr));
119} 121}
122EXPORT_SYMBOL(iounmap);
120 123
121/* sun3_map_test(addr, val) -- Reads a byte from addr, storing to val, 124/* sun3_map_test(addr, val) -- Reads a byte from addr, storing to val,
122 * trapping the potential read fault. Returns 0 if the access faulted, 125 * trapping the potential read fault. Returns 0 if the access faulted,
diff --git a/arch/m68k/sun3/Makefile b/arch/m68k/sun3/Makefile
index 4d4f0695d985..be1a8470d636 100644
--- a/arch/m68k/sun3/Makefile
+++ b/arch/m68k/sun3/Makefile
@@ -2,6 +2,6 @@
2# Makefile for Linux arch/m68k/sun3 source directory 2# Makefile for Linux arch/m68k/sun3 source directory
3# 3#
4 4
5obj-y := sun3_ksyms.o sun3ints.o sun3dvma.o sbus.o idprom.o 5obj-y := sun3ints.o sun3dvma.o sbus.o idprom.o
6 6
7obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o 7obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o
diff --git a/arch/m68k/sun3/idprom.c b/arch/m68k/sun3/idprom.c
index 02c1fee6fe74..dca6ab6a4ede 100644
--- a/arch/m68k/sun3/idprom.c
+++ b/arch/m68k/sun3/idprom.c
@@ -6,6 +6,7 @@
6 * Sun3/3x models added by David Monro (davidm@psrg.cs.usyd.edu.au) 6 * Sun3/3x models added by David Monro (davidm@psrg.cs.usyd.edu.au)
7 */ 7 */
8 8
9#include <linux/module.h>
9#include <linux/kernel.h> 10#include <linux/kernel.h>
10#include <linux/types.h> 11#include <linux/types.h>
11#include <linux/init.h> 12#include <linux/init.h>
@@ -16,6 +17,8 @@
16#include <asm/machines.h> /* Fun with Sun released architectures. */ 17#include <asm/machines.h> /* Fun with Sun released architectures. */
17 18
18struct idprom *idprom; 19struct idprom *idprom;
20EXPORT_SYMBOL(idprom);
21
19static struct idprom idprom_buffer; 22static struct idprom idprom_buffer;
20 23
21/* Here is the master table of Sun machines which use some implementation 24/* Here is the master table of Sun machines which use some implementation
diff --git a/arch/m68k/sun3/sun3_ksyms.c b/arch/m68k/sun3/sun3_ksyms.c
deleted file mode 100644
index 43e5a9af8abd..000000000000
--- a/arch/m68k/sun3/sun3_ksyms.c
+++ /dev/null
@@ -1,13 +0,0 @@
1#include <linux/module.h>
2#include <linux/types.h>
3#include <asm/dvma.h>
4#include <asm/idprom.h>
5
6/*
7 * Add things here when you find the need for it.
8 */
9EXPORT_SYMBOL(dvma_map_align);
10EXPORT_SYMBOL(dvma_unmap);
11EXPORT_SYMBOL(dvma_malloc_align);
12EXPORT_SYMBOL(dvma_free);
13EXPORT_SYMBOL(idprom);
diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c
index a2bc2da7f8f0..8709677fa025 100644
--- a/arch/m68k/sun3/sun3dvma.c
+++ b/arch/m68k/sun3/sun3dvma.c
@@ -6,6 +6,7 @@
6 * Contains common routines for sun3/sun3x DVMA management. 6 * Contains common routines for sun3/sun3x DVMA management.
7 */ 7 */
8 8
9#include <linux/module.h>
9#include <linux/kernel.h> 10#include <linux/kernel.h>
10#include <linux/mm.h> 11#include <linux/mm.h>
11#include <linux/list.h> 12#include <linux/list.h>
@@ -312,6 +313,7 @@ inline unsigned long dvma_map_align(unsigned long kaddr, int len, int align)
312 BUG(); 313 BUG();
313 return 0; 314 return 0;
314} 315}
316EXPORT_SYMBOL(dvma_map_align);
315 317
316void dvma_unmap(void *baddr) 318void dvma_unmap(void *baddr)
317{ 319{
@@ -327,7 +329,7 @@ void dvma_unmap(void *baddr)
327 return; 329 return;
328 330
329} 331}
330 332EXPORT_SYMBOL(dvma_unmap);
331 333
332void *dvma_malloc_align(unsigned long len, unsigned long align) 334void *dvma_malloc_align(unsigned long len, unsigned long align)
333{ 335{
@@ -367,6 +369,7 @@ void *dvma_malloc_align(unsigned long len, unsigned long align)
367 return (void *)vaddr; 369 return (void *)vaddr;
368 370
369} 371}
372EXPORT_SYMBOL(dvma_malloc_align);
370 373
371void dvma_free(void *vaddr) 374void dvma_free(void *vaddr)
372{ 375{
@@ -374,3 +377,4 @@ void dvma_free(void *vaddr)
374 return; 377 return;
375 378
376} 379}
380EXPORT_SYMBOL(dvma_free);
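The m68k and sun3 hunks above all follow the same cleanup: the collected *_ksyms.c files shrink or disappear, and each EXPORT_SYMBOL() moves next to the symbol it exports. A tiny before/after sketch with hypothetical symbol names, not code from this patch:

/* Hypothetical example of the EXPORT_SYMBOL relocation done above.
 *
 * Before: the definition lives in foo.c while the export is collected
 * in a separate foo_ksyms.c:
 *	#include <linux/module.h>
 *	extern int foo_param;
 *	EXPORT_SYMBOL(foo_param);
 *
 * After: the export sits right below each definition in foo.c itself.
 */
#include <linux/module.h>

int foo_param;
EXPORT_SYMBOL(foo_param);

int foo_do_work(void)
{
	return foo_param + 1;
}
EXPORT_SYMBOL(foo_do_work);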
diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c
index 6d57553d8ef8..8f6a0b312f7a 100644
--- a/arch/parisc/kernel/parisc_ksyms.c
+++ b/arch/parisc/kernel/parisc_ksyms.c
@@ -69,10 +69,6 @@ EXPORT_SYMBOL(memcpy_toio);
69EXPORT_SYMBOL(memcpy_fromio); 69EXPORT_SYMBOL(memcpy_fromio);
70EXPORT_SYMBOL(memset_io); 70EXPORT_SYMBOL(memset_io);
71 71
72#include <asm/unistd.h>
73EXPORT_SYMBOL(sys_lseek);
74EXPORT_SYMBOL(sys_write);
75
76#include <asm/semaphore.h> 72#include <asm/semaphore.h>
77EXPORT_SYMBOL(__up); 73EXPORT_SYMBOL(__up);
78EXPORT_SYMBOL(__down_interruptible); 74EXPORT_SYMBOL(__down_interruptible);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 16fe027bbc12..d1c0758c5611 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -307,11 +307,12 @@ void __init paging_init(void)
307 top_of_ram, total_ram); 307 top_of_ram, total_ram);
308 printk(KERN_DEBUG "Memory hole size: %ldMB\n", 308 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
309 (top_of_ram - total_ram) >> 20); 309 (top_of_ram - total_ram) >> 20);
310 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
310#ifdef CONFIG_HIGHMEM 311#ifdef CONFIG_HIGHMEM
311 max_zone_pfns[0] = total_lowmem >> PAGE_SHIFT; 312 max_zone_pfns[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
312 max_zone_pfns[1] = top_of_ram >> PAGE_SHIFT; 313 max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
313#else 314#else
314 max_zone_pfns[0] = top_of_ram >> PAGE_SHIFT; 315 max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
315#endif 316#endif
316 free_area_init_nodes(max_zone_pfns); 317 free_area_init_nodes(max_zone_pfns);
317} 318}
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 43c272075e1a..9da01dc8cfd9 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -617,9 +617,9 @@ void __init do_init_bootmem(void)
617 617
618void __init paging_init(void) 618void __init paging_init(void)
619{ 619{
620 unsigned long max_zone_pfns[MAX_NR_ZONES] = { 620 unsigned long max_zone_pfns[MAX_NR_ZONES];
621 lmb_end_of_DRAM() >> PAGE_SHIFT 621 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
622 }; 622 max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
623 free_area_init_nodes(max_zone_pfns); 623 free_area_init_nodes(max_zone_pfns);
624} 624}
625 625
diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c
index 410200046af1..c374e53ae03a 100644
--- a/arch/ppc/mm/init.c
+++ b/arch/ppc/mm/init.c
@@ -374,11 +374,12 @@ void __init paging_init(void)
374 end_pfn = start_pfn + (total_memory >> PAGE_SHIFT); 374 end_pfn = start_pfn + (total_memory >> PAGE_SHIFT);
375 add_active_range(0, start_pfn, end_pfn); 375 add_active_range(0, start_pfn, end_pfn);
376 376
377 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
377#ifdef CONFIG_HIGHMEM 378#ifdef CONFIG_HIGHMEM
378 max_zone_pfns[0] = total_lowmem >> PAGE_SHIFT; 379 max_zone_pfns[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
379 max_zone_pfns[1] = total_memory >> PAGE_SHIFT; 380 max_zone_pfns[ZONE_HIGHMEM] = total_memory >> PAGE_SHIFT;
380#else 381#else
381 max_zone_pfns[0] = total_memory >> PAGE_SHIFT; 382 max_zone_pfns[ZONE_DMA] = total_memory >> PAGE_SHIFT;
382#endif /* CONFIG_HIGHMEM */ 383#endif /* CONFIG_HIGHMEM */
383 free_area_init_nodes(max_zone_pfns); 384 free_area_init_nodes(max_zone_pfns);
384} 385}
diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c
index 9f19e833a562..90b5ef529eb7 100644
--- a/arch/s390/kernel/s390_ksyms.c
+++ b/arch/s390/kernel/s390_ksyms.c
@@ -51,4 +51,3 @@ EXPORT_SYMBOL(csum_fold);
51EXPORT_SYMBOL(console_mode); 51EXPORT_SYMBOL(console_mode);
52EXPORT_SYMBOL(console_devno); 52EXPORT_SYMBOL(console_devno);
53EXPORT_SYMBOL(console_irq); 53EXPORT_SYMBOL(console_irq);
54EXPORT_SYMBOL(sys_wait4);
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index d75307589d74..78fb619bdb73 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -25,6 +25,19 @@ config PCI
25config PCMCIA 25config PCMCIA
26 bool 26 bool
27 27
28# Yet to do!
29config TRACE_IRQFLAGS_SUPPORT
30 bool
31 default n
32
33config LOCKDEP_SUPPORT
34 bool
35 default y
36
37config STACKTRACE_SUPPORT
38 bool
39 default y
40
28config GENERIC_CALIBRATE_DELAY 41config GENERIC_CALIBRATE_DELAY
29 bool 42 bool
30 default y 43 default y
@@ -37,13 +50,15 @@ config IRQ_RELEASE_METHOD
37menu "UML-specific options" 50menu "UML-specific options"
38 51
39config MODE_TT 52config MODE_TT
40 bool "Tracing thread support" 53 bool "Tracing thread support (DEPRECATED)"
41 default n 54 default n
42 help 55 help
43 This option controls whether tracing thread support is compiled 56 This option controls whether tracing thread support is compiled
44 into UML. This option is largely obsolete, given that skas0 provides 57 into UML. This option is largely obsolete, given that skas0 provides
45 skas security and performance without needing to patch the host. 58 skas security and performance without needing to patch the host.
46 It is safe to say 'N' here. 59 It is safe to say 'N' here; saying 'Y' may cause additional problems
60 with the resulting binary even if you run UML in SKAS mode, and running
61 in TT mode is strongly *NOT RECOMMENDED*.
47 62
48config STATIC_LINK 63config STATIC_LINK
49 bool "Force a static link" 64 bool "Force a static link"
@@ -56,6 +71,9 @@ config STATIC_LINK
56 for use in a chroot jail. So, if you intend to run UML inside a 71 for use in a chroot jail. So, if you intend to run UML inside a
57 chroot, and you disable CONFIG_MODE_TT, you probably want to say Y 72 chroot, and you disable CONFIG_MODE_TT, you probably want to say Y
58 here. 73 here.
74 Additionally, this option enables using higher memory spaces (up to
75 2.75G) for UML - disabling CONFIG_MODE_TT and enabling this option leads
76 to best results for this.
59 77
60config KERNEL_HALF_GIGS 78config KERNEL_HALF_GIGS
61 int "Kernel address space size (in .5G units)" 79 int "Kernel address space size (in .5G units)"
@@ -72,10 +90,13 @@ config MODE_SKAS
72 default y 90 default y
73 help 91 help
74 This option controls whether skas (separate kernel address space) 92 This option controls whether skas (separate kernel address space)
75 support is compiled in. If you have applied the skas patch to the 93 support is compiled in.
76 host, then you certainly want to say Y here (and consider saying N 94 Unless you have specific needs to use TT mode (which applies almost only
77 to CONFIG_MODE_TT). Otherwise, it is safe to say Y. Disabling this 95 to developers), you should say Y here.
78 option will shrink the UML binary slightly. 96 SKAS mode will make use of the SKAS3 patch if it is applied on the host
97 (and your UML will run in SKAS3 mode), but if no SKAS patch is applied
98 on the host it will run in SKAS0 mode, which is anyway faster than TT
99 mode.
79 100
80source "arch/um/Kconfig.arch" 101source "arch/um/Kconfig.arch"
81source "mm/Kconfig" 102source "mm/Kconfig"
diff --git a/arch/um/Kconfig.i386 b/arch/um/Kconfig.i386
index f6eb72d117b9..f191a550a079 100644
--- a/arch/um/Kconfig.i386
+++ b/arch/um/Kconfig.i386
@@ -16,23 +16,42 @@ config SEMAPHORE_SLEEPERS
16 bool 16 bool
17 default y 17 default y
18 18
19config HOST_2G_2G 19choice
20 bool "2G/2G host address space split" 20 prompt "Host memory split"
21 default n 21 default HOST_VMSPLIT_3G
22 help 22 ---help---
23 This is needed when the host on which you run has a 2G/2G memory 23 This is needed when the host kernel on which you run has a non-default
24 split, instead of the customary 3G/1G. 24 (like 2G/2G) memory split, instead of the customary 3G/1G. If you did
25 25 not recompile your own kernel but use the default distro's one, you can
26 Note that to enable such a host 26 safely accept the "Default split" option.
27 configuration, which makes sense only in some cases, you need special 27
28 host patches. 28 It can be enabled on recent (>=2.6.16-rc2) vanilla kernels via
29 29 CONFIG_VM_SPLIT_*, or on previous kernels with special patches (-ck
30 So, if you do not know what to do here, say 'N'. 30 patchset by Con Kolivas, or other ones) - option names match closely the
31 host CONFIG_VM_SPLIT_* ones.
32
33 A lower setting (where 1G/3G is lowest and 3G/1G is higher) will
34 tolerate even more "normal" host kernels, but an higher setting will be
35 stricter.
36
37 So, if you do not know what to do here, say 'Default split'.
38
39 config HOST_VMSPLIT_3G
40 bool "Default split (3G/1G user/kernel host split)"
41 config HOST_VMSPLIT_3G_OPT
42 bool "3G/1G user/kernel host split (for full 1G low memory)"
43 config HOST_VMSPLIT_2G
44 bool "2G/2G user/kernel host split"
45 config HOST_VMSPLIT_1G
46 bool "1G/3G user/kernel host split"
47endchoice
31 48
32config TOP_ADDR 49config TOP_ADDR
33 hex 50 hex
34 default 0xc0000000 if !HOST_2G_2G 51 default 0xB0000000 if HOST_VMSPLIT_3G_OPT
35 default 0x80000000 if HOST_2G_2G 52 default 0x78000000 if HOST_VMSPLIT_2G
53 default 0x40000000 if HOST_VMSPLIT_1G
54 default 0xC0000000
36 55
37config 3_LEVEL_PGTABLES 56config 3_LEVEL_PGTABLES
38 bool "Three-level pagetables (EXPERIMENTAL)" 57 bool "Three-level pagetables (EXPERIMENTAL)"
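The help text above ties the new HOST_VMSPLIT_* choice to the host kernel's user/kernel split, and the TOP_ADDR defaults encode how much address space UML may use under each split. A small host-side program that restates those numbers — the values are copied from the hunk, the program itself is purely illustrative:

/* Prints the TOP_ADDR defaults from the hunk above next to the host
 * split they correspond to. Purely illustrative; build and run on any
 * host with: cc -o vmsplit vmsplit.c && ./vmsplit */
#include <stdio.h>

int main(void)
{
	static const struct {
		const char *split;
		unsigned long top_addr;
	} tbl[] = {
		{ "3G/1G default split",            0xC0000000UL },
		{ "3G/1G opt (full 1G low memory)", 0xB0000000UL },
		{ "2G/2G split",                    0x78000000UL },
		{ "1G/3G split",                    0x40000000UL },
	};

	for (unsigned int i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
		printf("%-32s TOP_ADDR 0x%08lx (%lu MB of address space below it)\n",
		       tbl[i].split, tbl[i].top_addr, tbl[i].top_addr >> 20);
	return 0;
}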
diff --git a/arch/um/Makefile-x86_64 b/arch/um/Makefile-x86_64
index 11154b6773ec..d278682dd799 100644
--- a/arch/um/Makefile-x86_64
+++ b/arch/um/Makefile-x86_64
@@ -1,10 +1,10 @@
1# Copyright 2003 - 2004 Pathscale, Inc 1# Copyright 2003 - 2004 Pathscale, Inc
2# Released under the GPL 2# Released under the GPL
3 3
4core-y += arch/um/sys-x86_64/ 4core-y += arch/um/sys-x86_64/ arch/x86_64/crypto/
5START := 0x60000000 5START := 0x60000000
6 6
7_extra_flags_ = -fno-builtin -m64 -mcmodel=kernel 7_extra_flags_ = -fno-builtin -m64
8 8
9#We #undef __x86_64__ for kernelspace, not for userspace where 9#We #undef __x86_64__ for kernelspace, not for userspace where
10#it's needed for headers to work! 10#it's needed for headers to work!
diff --git a/arch/um/include/common-offsets.h b/arch/um/include/common-offsets.h
index 356390d1f8b9..461175f8b1d9 100644
--- a/arch/um/include/common-offsets.h
+++ b/arch/um/include/common-offsets.h
@@ -1,9 +1,16 @@
1/* for use by sys-$SUBARCH/kernel-offsets.c */ 1/* for use by sys-$SUBARCH/kernel-offsets.c */
2 2
3DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE);
4#ifdef CONFIG_MODE_TT
5OFFSET(HOST_TASK_EXTERN_PID, task_struct, thread.mode.tt.extern_pid);
6#endif
7
3OFFSET(HOST_TASK_REGS, task_struct, thread.regs); 8OFFSET(HOST_TASK_REGS, task_struct, thread.regs);
4OFFSET(HOST_TASK_PID, task_struct, pid); 9OFFSET(HOST_TASK_PID, task_struct, pid);
10
5DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); 11DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE);
6DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); 12DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
13
7DEFINE_STR(UM_KERN_EMERG, KERN_EMERG); 14DEFINE_STR(UM_KERN_EMERG, KERN_EMERG);
8DEFINE_STR(UM_KERN_ALERT, KERN_ALERT); 15DEFINE_STR(UM_KERN_ALERT, KERN_ALERT);
9DEFINE_STR(UM_KERN_CRIT, KERN_CRIT); 16DEFINE_STR(UM_KERN_CRIT, KERN_CRIT);
@@ -12,6 +19,10 @@ DEFINE_STR(UM_KERN_WARNING, KERN_WARNING);
12DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE); 19DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE);
13DEFINE_STR(UM_KERN_INFO, KERN_INFO); 20DEFINE_STR(UM_KERN_INFO, KERN_INFO);
14DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG); 21DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG);
22
15DEFINE(UM_ELF_CLASS, ELF_CLASS); 23DEFINE(UM_ELF_CLASS, ELF_CLASS);
16DEFINE(UM_ELFCLASS32, ELFCLASS32); 24DEFINE(UM_ELFCLASS32, ELFCLASS32);
17DEFINE(UM_ELFCLASS64, ELFCLASS64); 25DEFINE(UM_ELFCLASS64, ELFCLASS64);
26
27/* For crypto assembler code. */
28DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
diff --git a/arch/um/include/longjmp.h b/arch/um/include/longjmp.h
index e93c6d3e893b..e860bc5848e0 100644
--- a/arch/um/include/longjmp.h
+++ b/arch/um/include/longjmp.h
@@ -12,7 +12,8 @@ extern void longjmp(jmp_buf, int);
12} while(0) 12} while(0)
13 13
14#define UML_SETJMP(buf) ({ \ 14#define UML_SETJMP(buf) ({ \
15 int n, enable; \ 15 int n; \
16 volatile int enable; \
16 enable = get_signals(); \ 17 enable = get_signals(); \
17 n = setjmp(*buf); \ 18 n = setjmp(*buf); \
18 if(n != 0) \ 19 if(n != 0) \
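The UML_SETJMP change above qualifies the macro's local with volatile: an automatic variable that is live across a setjmp()/longjmp() pair is only guaranteed to keep its value if it is volatile (or left untouched between the calls), since the compiler may otherwise cache it in a register that the jump clobbers. A small standalone demonstration of the rule, unrelated to UML itself:

/* Userspace demonstration of why the UML_SETJMP macro above makes its
 * local volatile. Per C99 7.13.2.1, a non-volatile automatic object
 * modified between setjmp() and longjmp() has an indeterminate value after
 * the jump; the volatile copy is guaranteed to survive.
 * Build with optimization to see the effect, e.g.: cc -O2 demo.c */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void fault(void)
{
	longjmp(env, 1);
}

int main(void)
{
	int plain = 0;		/* may be cached in a clobbered register */
	volatile int kept = 0;	/* guaranteed to hold its last value */

	if (setjmp(env) == 0) {
		plain = 42;
		kept = 42;
		fault();	/* jumps back to setjmp(), which returns 1 */
	}

	/* "kept" is reliably 42; "plain" may or may not be. */
	printf("plain=%d kept=%d\n", plain, kept);
	return 0;
}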
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 120ca21a513a..6516f6dca96d 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -201,6 +201,7 @@ extern int os_getpgrp(void);
201 201
202#ifdef UML_CONFIG_MODE_TT 202#ifdef UML_CONFIG_MODE_TT
203extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)); 203extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int));
204extern void stop(void);
204#endif 205#endif
205extern void init_new_thread_signals(void); 206extern void init_new_thread_signals(void);
206extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr); 207extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr);
diff --git a/arch/um/include/sysdep-i386/kernel-offsets.h b/arch/um/include/sysdep-i386/kernel-offsets.h
index 2c13de321f2f..97ec9d894d75 100644
--- a/arch/um/include/sysdep-i386/kernel-offsets.h
+++ b/arch/um/include/sysdep-i386/kernel-offsets.h
@@ -1,6 +1,7 @@
1#include <linux/stddef.h> 1#include <linux/stddef.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/elf.h> 3#include <linux/elf.h>
4#include <linux/crypto.h>
4#include <asm/mman.h> 5#include <asm/mman.h>
5 6
6#define DEFINE(sym, val) \ 7#define DEFINE(sym, val) \
@@ -17,9 +18,5 @@
17void foo(void) 18void foo(void)
18{ 19{
19 OFFSET(HOST_TASK_DEBUGREGS, task_struct, thread.arch.debugregs); 20 OFFSET(HOST_TASK_DEBUGREGS, task_struct, thread.arch.debugregs);
20 DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE);
21#ifdef CONFIG_MODE_TT
22 OFFSET(HOST_TASK_EXTERN_PID, task_struct, thread.mode.tt.extern_pid);
23#endif
24#include <common-offsets.h> 21#include <common-offsets.h>
25} 22}
diff --git a/arch/um/include/sysdep-x86_64/kernel-offsets.h b/arch/um/include/sysdep-x86_64/kernel-offsets.h
index 91d129fb3930..a307237b7964 100644
--- a/arch/um/include/sysdep-x86_64/kernel-offsets.h
+++ b/arch/um/include/sysdep-x86_64/kernel-offsets.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/time.h> 3#include <linux/time.h>
4#include <linux/elf.h> 4#include <linux/elf.h>
5#include <linux/crypto.h>
5#include <asm/page.h> 6#include <asm/page.h>
6#include <asm/mman.h> 7#include <asm/mman.h>
7 8
@@ -18,9 +19,5 @@
18 19
19void foo(void) 20void foo(void)
20{ 21{
21 DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE);
22#ifdef CONFIG_MODE_TT
23 OFFSET(HOST_TASK_EXTERN_PID, task_struct, thread.mode.tt.extern_pid);
24#endif
25#include <common-offsets.h> 22#include <common-offsets.h>
26} 23}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index c17eddcf89b3..2c6d090a2e87 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -60,10 +60,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
60#endif 60#endif
61 61
62 *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); 62 *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
63 /* This is wrong for the code page, but it doesn't matter since the 63 *pte = pte_mkread(*pte);
64 * stub is mapped by hand with the correct permissions.
65 */
66 *pte = pte_mkwrite(*pte);
67 return(0); 64 return(0);
68 65
69 out_pmd: 66 out_pmd:
diff --git a/arch/um/kernel/tt/uaccess_user.c b/arch/um/kernel/tt/uaccess_user.c
index 6c92bbccb49c..ed1abcf4d057 100644
--- a/arch/um/kernel/tt/uaccess_user.c
+++ b/arch/um/kernel/tt/uaccess_user.c
@@ -4,13 +4,13 @@
4 * Licensed under the GPL 4 * Licensed under the GPL
5 */ 5 */
6 6
7#include <setjmp.h>
8#include <string.h> 7#include <string.h>
9#include "user_util.h" 8#include "user_util.h"
10#include "uml_uaccess.h" 9#include "uml_uaccess.h"
11#include "task.h" 10#include "task.h"
12#include "kern_util.h" 11#include "kern_util.h"
13#include "os.h" 12#include "os.h"
13#include "longjmp.h"
14 14
15int __do_copy_from_user(void *to, const void *from, int n, 15int __do_copy_from_user(void *to, const void *from, int n,
16 void **fault_addr, void **fault_catcher) 16 void **fault_addr, void **fault_catcher)
@@ -80,10 +80,10 @@ int __do_strnlen_user(const char *str, unsigned long n,
80 struct tt_regs save = TASK_REGS(get_current())->tt; 80 struct tt_regs save = TASK_REGS(get_current())->tt;
81 int ret; 81 int ret;
82 unsigned long *faddrp = (unsigned long *)fault_addr; 82 unsigned long *faddrp = (unsigned long *)fault_addr;
83 sigjmp_buf jbuf; 83 jmp_buf jbuf;
84 84
85 *fault_catcher = &jbuf; 85 *fault_catcher = &jbuf;
86 if(sigsetjmp(jbuf, 1) == 0) 86 if(UML_SETJMP(&jbuf) == 0)
87 ret = strlen(str) + 1; 87 ret = strlen(str) + 1;
88 else ret = *faddrp - (unsigned long) str; 88 else ret = *faddrp - (unsigned long) str;
89 89
diff --git a/arch/um/os-Linux/tt.c b/arch/um/os-Linux/tt.c
index 5461a065bbb9..3dc3a02d6263 100644
--- a/arch/um/os-Linux/tt.c
+++ b/arch/um/os-Linux/tt.c
@@ -10,7 +10,6 @@
10#include <errno.h> 10#include <errno.h>
11#include <stdarg.h> 11#include <stdarg.h>
12#include <stdlib.h> 12#include <stdlib.h>
13#include <setjmp.h>
14#include <sys/time.h> 13#include <sys/time.h>
15#include <sys/ptrace.h> 14#include <sys/ptrace.h>
16#include <linux/ptrace.h> 15#include <linux/ptrace.h>
diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index 3f5b1514e8a7..56b8a50e8bc2 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -80,11 +80,18 @@ void setup_machinename(char *machine_out)
80 struct utsname host; 80 struct utsname host;
81 81
82 uname(&host); 82 uname(&host);
83#if defined(UML_CONFIG_UML_X86) && !defined(UML_CONFIG_64BIT) 83#ifdef UML_CONFIG_UML_X86
84# ifndef UML_CONFIG_64BIT
84 if (!strcmp(host.machine, "x86_64")) { 85 if (!strcmp(host.machine, "x86_64")) {
85 strcpy(machine_out, "i686"); 86 strcpy(machine_out, "i686");
86 return; 87 return;
87 } 88 }
89# else
90 if (!strcmp(host.machine, "i686")) {
91 strcpy(machine_out, "x86_64");
92 return;
93 }
94# endif
88#endif 95#endif
89 strcpy(machine_out, host.machine); 96 strcpy(machine_out, host.machine);
90} 97}
diff --git a/arch/um/sys-x86_64/ksyms.c b/arch/um/sys-x86_64/ksyms.c
index 859273808203..12c593607c59 100644
--- a/arch/um/sys-x86_64/ksyms.c
+++ b/arch/um/sys-x86_64/ksyms.c
@@ -14,6 +14,3 @@ EXPORT_SYMBOL(__up_wakeup);
14 14
15/*XXX: we need them because they would be exported by x86_64 */ 15/*XXX: we need them because they would be exported by x86_64 */
16EXPORT_SYMBOL(__memcpy); 16EXPORT_SYMBOL(__memcpy);
17
18/* Networking helper routines. */
19EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 771bcf77daf2..c3cdcab29688 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1897,7 +1897,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
1897} 1897}
1898#endif 1898#endif
1899 1899
1900static struct hw_interrupt_type ht_irq_chip = { 1900static struct irq_chip ht_irq_chip = {
1901 .name = "PCI-HT", 1901 .name = "PCI-HT",
1902 .mask = mask_ht_irq, 1902 .mask = mask_ht_irq,
1903 .unmask = unmask_ht_irq, 1903 .unmask = unmask_ht_irq,
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 19c72520a868..971dc1181e69 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -406,9 +406,12 @@ void __cpuinit zap_low_mappings(int cpu)
406#ifndef CONFIG_NUMA 406#ifndef CONFIG_NUMA
407void __init paging_init(void) 407void __init paging_init(void)
408{ 408{
409 unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN, 409 unsigned long max_zone_pfns[MAX_NR_ZONES];
410 MAX_DMA32_PFN, 410 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
411 end_pfn}; 411 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
412 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
413 max_zone_pfns[ZONE_NORMAL] = end_pfn;
414
412 memory_present(0, 0, end_pfn); 415 memory_present(0, 0, end_pfn);
413 sparse_init(); 416 sparse_init();
414 free_area_init_nodes(max_zone_pfns); 417 free_area_init_nodes(max_zone_pfns);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 829a008bd39b..2ee2e003606c 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -338,9 +338,11 @@ static void __init arch_sparse_init(void)
338void __init paging_init(void) 338void __init paging_init(void)
339{ 339{
340 int i; 340 int i;
341 unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN, 341 unsigned long max_zone_pfns[MAX_NR_ZONES];
342 MAX_DMA32_PFN, 342 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
343 end_pfn}; 343 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
344 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
345 max_zone_pfns[ZONE_NORMAL] = end_pfn;
344 346
345 arch_sparse_init(); 347 arch_sparse_init();
346 348
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 77138a39eb04..83728a9457ad 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -870,7 +870,11 @@ static unsigned int ata_id_xfermask(const u16 *id)
870 * the PIO timing number for the maximum. Turn it into 870 * the PIO timing number for the maximum. Turn it into
871 * a mask. 871 * a mask.
872 */ 872 */
873 pio_mask = (2 << (id[ATA_ID_OLD_PIO_MODES] & 0xFF)) - 1 ; 873 u8 mode = id[ATA_ID_OLD_PIO_MODES] & 0xFF;
874 if (mode < 5) /* Valid PIO range */
875 pio_mask = (2 << mode) - 1;
876 else
877 pio_mask = 1;
874 878
875 /* But wait.. there's more. Design your standards by 879 /* But wait.. there's more. Design your standards by
876 * committee and you too can get a free iordy field to 880 * committee and you too can get a free iordy field to
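The libata-core hunk above clamps the old-style PIO mode number before turning it into a mode mask, since (2 << mode) - 1 only makes sense for the defined PIO modes 0–4; a corrupt identify word would otherwise produce an absurd mask. A worked example of the arithmetic, as standalone code rather than libata itself:

/* Worked example of the pio_mask computation fixed above: mode numbers
 * 0-4 expand to a bitmask of supported PIO modes, anything larger falls
 * back to PIO0 only. Standalone userspace code, not part of libata. */
#include <stdio.h>

static unsigned int pio_mask_from_mode(unsigned char mode)
{
	if (mode < 5)			/* valid PIO range: modes 0..4 */
		return (2u << mode) - 1;/* e.g. mode 2 -> 0x07 (PIO0-2) */
	return 1;			/* bogus value: claim PIO0 only */
}

int main(void)
{
	for (unsigned int mode = 0; mode <= 6; mode++)
		printf("mode %u -> pio_mask 0x%02x\n",
		       mode, pio_mask_from_mode((unsigned char)mode));
	return 0;
}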
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b0d0cc41f3e8..7af2a4ba4990 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -164,10 +164,10 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
164{ 164{
165 int rc = 0; 165 int rc = 0;
166 u8 scsi_cmd[MAX_COMMAND_SIZE]; 166 u8 scsi_cmd[MAX_COMMAND_SIZE];
167 u8 args[4], *argbuf = NULL; 167 u8 args[4], *argbuf = NULL, *sensebuf = NULL;
168 int argsize = 0; 168 int argsize = 0;
169 struct scsi_sense_hdr sshdr;
170 enum dma_data_direction data_dir; 169 enum dma_data_direction data_dir;
170 int cmd_result;
171 171
172 if (arg == NULL) 172 if (arg == NULL)
173 return -EINVAL; 173 return -EINVAL;
@@ -175,6 +175,10 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
175 if (copy_from_user(args, arg, sizeof(args))) 175 if (copy_from_user(args, arg, sizeof(args)))
176 return -EFAULT; 176 return -EFAULT;
177 177
178 sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
179 if (!sensebuf)
180 return -ENOMEM;
181
178 memset(scsi_cmd, 0, sizeof(scsi_cmd)); 182 memset(scsi_cmd, 0, sizeof(scsi_cmd));
179 183
180 if (args[3]) { 184 if (args[3]) {
@@ -191,7 +195,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
191 data_dir = DMA_FROM_DEVICE; 195 data_dir = DMA_FROM_DEVICE;
192 } else { 196 } else {
193 scsi_cmd[1] = (3 << 1); /* Non-data */ 197 scsi_cmd[1] = (3 << 1); /* Non-data */
194 /* scsi_cmd[2] is already 0 -- no off.line, cc, or data xfer */ 198 scsi_cmd[2] = 0x20; /* cc but no off.line or data xfer */
195 data_dir = DMA_NONE; 199 data_dir = DMA_NONE;
196 } 200 }
197 201
@@ -210,18 +214,46 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
210 214
211 /* Good values for timeout and retries? Values below 215 /* Good values for timeout and retries? Values below
212 from scsi_ioctl_send_command() for default case... */ 216 from scsi_ioctl_send_command() for default case... */
213 if (scsi_execute_req(scsidev, scsi_cmd, data_dir, argbuf, argsize, 217 cmd_result = scsi_execute(scsidev, scsi_cmd, data_dir, argbuf, argsize,
214 &sshdr, (10*HZ), 5)) { 218 sensebuf, (10*HZ), 5, 0);
219
220 if (driver_byte(cmd_result) == DRIVER_SENSE) {/* sense data available */
221 u8 *desc = sensebuf + 8;
222 cmd_result &= ~(0xFF<<24); /* DRIVER_SENSE is not an error */
223
224 /* If we set cc then ATA pass-through will cause a
225 * check condition even if no error. Filter that. */
226 if (cmd_result & SAM_STAT_CHECK_CONDITION) {
227 struct scsi_sense_hdr sshdr;
228 scsi_normalize_sense(sensebuf, SCSI_SENSE_BUFFERSIZE,
229 &sshdr);
230 if (sshdr.sense_key==0 &&
231 sshdr.asc==0 && sshdr.ascq==0)
232 cmd_result &= ~SAM_STAT_CHECK_CONDITION;
233 }
234
235 /* Send userspace a few ATA registers (same as drivers/ide) */
236 if (sensebuf[0] == 0x72 && /* format is "descriptor" */
237 desc[0] == 0x09 ) { /* code is "ATA Descriptor" */
238 args[0] = desc[13]; /* status */
239 args[1] = desc[3]; /* error */
240 args[2] = desc[5]; /* sector count (0:7) */
241 if (copy_to_user(arg, args, sizeof(args)))
242 rc = -EFAULT;
243 }
244 }
245
246
247 if (cmd_result) {
215 rc = -EIO; 248 rc = -EIO;
216 goto error; 249 goto error;
217 } 250 }
218 251
219 /* Need code to retrieve data from check condition? */
220
221 if ((argbuf) 252 if ((argbuf)
222 && copy_to_user(arg + sizeof(args), argbuf, argsize)) 253 && copy_to_user(arg + sizeof(args), argbuf, argsize))
223 rc = -EFAULT; 254 rc = -EFAULT;
224error: 255error:
256 kfree(sensebuf);
225 kfree(argbuf); 257 kfree(argbuf);
226 return rc; 258 return rc;
227} 259}
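For reference, the ioctl now issues the command with a sense buffer, clears the DRIVER_SENSE byte, drops the CHECK CONDITION status that the cc bit provokes when nothing actually failed, and copies a few ATA registers back to userspace from descriptor-format sense data. A minimal sketch of that last step only, assuming a sense buffer already filled in by the command (hypothetical helper, same offsets as the hunk above):

/* Pull ATA status/error/count out of descriptor-format sense data.
 * 'sense' is assumed to hold at least 22 valid bytes. */
static int ata_regs_from_sense(const unsigned char *sense, unsigned char regs[3])
{
        const unsigned char *desc = sense + 8;

        if (sense[0] != 0x72 || desc[0] != 0x09)  /* not an ATA descriptor */
                return -1;

        regs[0] = desc[13];     /* status */
        regs[1] = desc[3];      /* error */
        regs[2] = desc[5];      /* sector count (0:7) */
        return 0;
}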
diff --git a/drivers/ata/pata_qdi.c b/drivers/ata/pata_qdi.c
index 7977f471d5e9..2c3cc0ccc606 100644
--- a/drivers/ata/pata_qdi.c
+++ b/drivers/ata/pata_qdi.c
@@ -141,7 +141,7 @@ static void qdi_data_xfer(struct ata_device *adev, unsigned char *buf, unsigned
141 memcpy(&pad, buf + buflen - slop, slop); 141 memcpy(&pad, buf + buflen - slop, slop);
142 outl(le32_to_cpu(pad), ap->ioaddr.data_addr); 142 outl(le32_to_cpu(pad), ap->ioaddr.data_addr);
143 } else { 143 } else {
144 pad = cpu_to_le16(inl(ap->ioaddr.data_addr)); 144 pad = cpu_to_le32(inl(ap->ioaddr.data_addr));
145 memcpy(buf + buflen - slop, &pad, slop); 145 memcpy(buf + buflen - slop, &pad, slop);
146 } 146 }
147 } 147 }
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 8bcdfa64667c..72eda5160fad 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -260,6 +260,7 @@ static const struct pci_device_id pdc_ata_pci_tbl[] = {
260#if 0 260#if 0
261 { PCI_VDEVICE(PROMISE, 0x3570), board_20771 }, 261 { PCI_VDEVICE(PROMISE, 0x3570), board_20771 },
262#endif 262#endif
263 { PCI_VDEVICE(PROMISE, 0x3577), board_20771 },
263 264
264 { } /* terminate list */ 265 { } /* terminate list */
265}; 266};
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
index cec539e601fe..6148073532b2 100644
--- a/drivers/block/DAC960.h
+++ b/drivers/block/DAC960.h
@@ -4379,8 +4379,8 @@ static inline void DAC960_P_To_PD_TranslateEnquiry(void *Enquiry)
4379static inline void DAC960_P_To_PD_TranslateDeviceState(void *DeviceState) 4379static inline void DAC960_P_To_PD_TranslateDeviceState(void *DeviceState)
4380{ 4380{
4381 memcpy(DeviceState + 2, DeviceState + 3, 1); 4381 memcpy(DeviceState + 2, DeviceState + 3, 1);
4382 memcpy(DeviceState + 4, DeviceState + 5, 2); 4382 memmove(DeviceState + 4, DeviceState + 5, 2);
4383 memcpy(DeviceState + 6, DeviceState + 8, 4); 4383 memmove(DeviceState + 6, DeviceState + 8, 4);
4384} 4384}
4385 4385
4386static inline 4386static inline
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 5d254b714509..5d6562171533 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1709,10 +1709,13 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
1709 return get_disk(unit[drive].gendisk); 1709 return get_disk(unit[drive].gendisk);
1710} 1710}
1711 1711
1712int __init amiga_floppy_init(void) 1712static int __init amiga_floppy_init(void)
1713{ 1713{
1714 int i, ret; 1714 int i, ret;
1715 1715
1716 if (!MACH_IS_AMIGA)
1717 return -ENXIO;
1718
1716 if (!AMIGAHW_PRESENT(AMI_FLOPPY)) 1719 if (!AMIGAHW_PRESENT(AMI_FLOPPY))
1717 return -ENXIO; 1720 return -ENXIO;
1718 1721
@@ -1809,15 +1812,9 @@ out_blkdev:
1809 return ret; 1812 return ret;
1810} 1813}
1811 1814
1815module_init(amiga_floppy_init);
1812#ifdef MODULE 1816#ifdef MODULE
1813 1817
1814int init_module(void)
1815{
1816 if (!MACH_IS_AMIGA)
1817 return -ENXIO;
1818 return amiga_floppy_init();
1819}
1820
1821#if 0 /* not safe to unload */ 1818#if 0 /* not safe to unload */
1822void cleanup_module(void) 1819void cleanup_module(void)
1823{ 1820{
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 10cc38783bdf..0d97b7eb818a 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -48,9 +48,9 @@
48#include <linux/blkdev.h> 48#include <linux/blkdev.h>
49#include <linux/blkpg.h> 49#include <linux/blkpg.h>
50#include <linux/delay.h> 50#include <linux/delay.h>
51#include <linux/io.h>
51 52
52#include <asm/system.h> 53#include <asm/system.h>
53#include <asm/io.h>
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/dma.h> 55#include <asm/dma.h>
56 56
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 82ddbdd7bd4b..7cc2685ca84a 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -329,7 +329,7 @@ static struct kobject *z2_find(dev_t dev, int *part, void *data)
329 329
330static struct request_queue *z2_queue; 330static struct request_queue *z2_queue;
331 331
332int __init 332static int __init
333z2_init(void) 333z2_init(void)
334{ 334{
335 int ret; 335 int ret;
@@ -370,26 +370,7 @@ err:
370 return ret; 370 return ret;
371} 371}
372 372
373#if defined(MODULE) 373static void __exit z2_exit(void)
374
375MODULE_LICENSE("GPL");
376
377int
378init_module( void )
379{
380 int error;
381
382 error = z2_init();
383 if ( error == 0 )
384 {
385 printk( KERN_INFO DEVICE_NAME ": loaded as module\n" );
386 }
387
388 return error;
389}
390
391void
392cleanup_module( void )
393{ 374{
394 int i, j; 375 int i, j;
395 blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), 256); 376 blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), 256);
@@ -425,4 +406,7 @@ cleanup_module( void )
425 406
426 return; 407 return;
427} 408}
428#endif 409
410module_init(z2_init);
411module_exit(z2_exit);
412MODULE_LICENSE("GPL");
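Both the amiflop and z2ram hunks replace hand-rolled init_module()/cleanup_module() wrappers with module_init()/module_exit(), so the same static functions serve built-in and modular builds. A bare-bones sketch of the resulting boilerplate for a hypothetical "foo" driver (illustrative only, not part of the patch):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>

static int __init foo_init(void)
{
        printk(KERN_INFO "foo: loaded\n");
        return 0;       /* a nonzero return aborts the load */
}

static void __exit foo_exit(void)
{
        printk(KERN_INFO "foo: unloaded\n");
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");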
diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c
index fc944d375be7..54d93f0345e8 100644
--- a/drivers/char/ip2/i2lib.c
+++ b/drivers/char/ip2/i2lib.c
@@ -1007,7 +1007,7 @@ i2InputAvailable(i2ChanStrPtr pCh)
1007// applications that one cannot break out of. 1007// applications that one cannot break out of.
1008//****************************************************************************** 1008//******************************************************************************
1009static int 1009static int
1010i2Output(i2ChanStrPtr pCh, const char *pSource, int count, int user ) 1010i2Output(i2ChanStrPtr pCh, const char *pSource, int count)
1011{ 1011{
1012 i2eBordStrPtr pB; 1012 i2eBordStrPtr pB;
1013 unsigned char *pInsert; 1013 unsigned char *pInsert;
@@ -1020,7 +1020,7 @@ i2Output(i2ChanStrPtr pCh, const char *pSource, int count, int user )
1020 1020
1021 int bailout = 10; 1021 int bailout = 10;
1022 1022
1023 ip2trace (CHANN, ITRC_OUTPUT, ITRC_ENTER, 2, count, user ); 1023 ip2trace (CHANN, ITRC_OUTPUT, ITRC_ENTER, 2, count, 0 );
1024 1024
1025 // Ensure channel structure seems real 1025 // Ensure channel structure seems real
1026 if ( !i2Validate ( pCh ) ) 1026 if ( !i2Validate ( pCh ) )
@@ -1087,12 +1087,7 @@ i2Output(i2ChanStrPtr pCh, const char *pSource, int count, int user )
1087 DATA_COUNT_OF(pInsert) = amountToMove; 1087 DATA_COUNT_OF(pInsert) = amountToMove;
1088 1088
1089 // Move the data 1089 // Move the data
1090 if ( user ) { 1090 memcpy( (char*)(DATA_OF(pInsert)), pSource, amountToMove );
1091 rc = copy_from_user((char*)(DATA_OF(pInsert)), pSource,
1092 amountToMove );
1093 } else {
1094 memcpy( (char*)(DATA_OF(pInsert)), pSource, amountToMove );
1095 }
1096 // Adjust pointers and indices 1091 // Adjust pointers and indices
1097 pSource += amountToMove; 1092 pSource += amountToMove;
1098 pCh->Obuf_char_count += amountToMove; 1093 pCh->Obuf_char_count += amountToMove;
diff --git a/drivers/char/ip2/i2lib.h b/drivers/char/ip2/i2lib.h
index 952e113ccd8a..e559e9bac06d 100644
--- a/drivers/char/ip2/i2lib.h
+++ b/drivers/char/ip2/i2lib.h
@@ -332,7 +332,7 @@ static int i2QueueCommands(int, i2ChanStrPtr, int, int, cmdSyntaxPtr,...);
332static int i2GetStatus(i2ChanStrPtr, int); 332static int i2GetStatus(i2ChanStrPtr, int);
333static int i2Input(i2ChanStrPtr); 333static int i2Input(i2ChanStrPtr);
334static int i2InputFlush(i2ChanStrPtr); 334static int i2InputFlush(i2ChanStrPtr);
335static int i2Output(i2ChanStrPtr, const char *, int, int); 335static int i2Output(i2ChanStrPtr, const char *, int);
336static int i2OutputFree(i2ChanStrPtr); 336static int i2OutputFree(i2ChanStrPtr);
337static int i2ServiceBoard(i2eBordStrPtr); 337static int i2ServiceBoard(i2eBordStrPtr);
338static void i2DrainOutput(i2ChanStrPtr, int); 338static void i2DrainOutput(i2ChanStrPtr, int);
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index 858ba5432c99..a3f32d46d2f8 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -1704,7 +1704,7 @@ ip2_write( PTTY tty, const unsigned char *pData, int count)
1704 1704
1705 /* This is the actual move bit. Make sure it does what we need!!!!! */ 1705 /* This is the actual move bit. Make sure it does what we need!!!!! */
1706 WRITE_LOCK_IRQSAVE(&pCh->Pbuf_spinlock,flags); 1706 WRITE_LOCK_IRQSAVE(&pCh->Pbuf_spinlock,flags);
1707 bytesSent = i2Output( pCh, pData, count, 0 ); 1707 bytesSent = i2Output( pCh, pData, count);
1708 WRITE_UNLOCK_IRQRESTORE(&pCh->Pbuf_spinlock,flags); 1708 WRITE_UNLOCK_IRQRESTORE(&pCh->Pbuf_spinlock,flags);
1709 1709
1710 ip2trace (CHANN, ITRC_WRITE, ITRC_RETURN, 1, bytesSent ); 1710 ip2trace (CHANN, ITRC_WRITE, ITRC_RETURN, 1, bytesSent );
@@ -1764,7 +1764,7 @@ ip2_flush_chars( PTTY tty )
1764 // 1764 //
1765 // We may need to restart i2Output if it does not fullfill this request 1765 // We may need to restart i2Output if it does not fullfill this request
1766 // 1766 //
1767 strip = i2Output( pCh, pCh->Pbuf, pCh->Pbuf_stuff, 0 ); 1767 strip = i2Output( pCh, pCh->Pbuf, pCh->Pbuf_stuff);
1768 if ( strip != pCh->Pbuf_stuff ) { 1768 if ( strip != pCh->Pbuf_stuff ) {
1769 memmove( pCh->Pbuf, &pCh->Pbuf[strip], pCh->Pbuf_stuff - strip ); 1769 memmove( pCh->Pbuf, &pCh->Pbuf[strip], pCh->Pbuf_stuff - strip );
1770 } 1770 }
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 2455e8d478ac..34a4fd13fa81 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -1928,13 +1928,8 @@ static ssize_t guid_show(struct device *dev, struct device_attribute *attr,
1928 (long long) bmc->guid[8]); 1928 (long long) bmc->guid[8]);
1929} 1929}
1930 1930
1931static void 1931static void remove_files(struct bmc_device *bmc)
1932cleanup_bmc_device(struct kref *ref)
1933{ 1932{
1934 struct bmc_device *bmc;
1935
1936 bmc = container_of(ref, struct bmc_device, refcount);
1937
1938 device_remove_file(&bmc->dev->dev, 1933 device_remove_file(&bmc->dev->dev,
1939 &bmc->device_id_attr); 1934 &bmc->device_id_attr);
1940 device_remove_file(&bmc->dev->dev, 1935 device_remove_file(&bmc->dev->dev,
@@ -1951,12 +1946,23 @@ cleanup_bmc_device(struct kref *ref)
1951 &bmc->manufacturer_id_attr); 1946 &bmc->manufacturer_id_attr);
1952 device_remove_file(&bmc->dev->dev, 1947 device_remove_file(&bmc->dev->dev,
1953 &bmc->product_id_attr); 1948 &bmc->product_id_attr);
1949
1954 if (bmc->id.aux_firmware_revision_set) 1950 if (bmc->id.aux_firmware_revision_set)
1955 device_remove_file(&bmc->dev->dev, 1951 device_remove_file(&bmc->dev->dev,
1956 &bmc->aux_firmware_rev_attr); 1952 &bmc->aux_firmware_rev_attr);
1957 if (bmc->guid_set) 1953 if (bmc->guid_set)
1958 device_remove_file(&bmc->dev->dev, 1954 device_remove_file(&bmc->dev->dev,
1959 &bmc->guid_attr); 1955 &bmc->guid_attr);
1956}
1957
1958static void
1959cleanup_bmc_device(struct kref *ref)
1960{
1961 struct bmc_device *bmc;
1962
1963 bmc = container_of(ref, struct bmc_device, refcount);
1964
1965 remove_files(bmc);
1960 platform_device_unregister(bmc->dev); 1966 platform_device_unregister(bmc->dev);
1961 kfree(bmc); 1967 kfree(bmc);
1962} 1968}
@@ -1977,6 +1983,79 @@ static void ipmi_bmc_unregister(ipmi_smi_t intf)
1977 mutex_unlock(&ipmidriver_mutex); 1983 mutex_unlock(&ipmidriver_mutex);
1978} 1984}
1979 1985
1986static int create_files(struct bmc_device *bmc)
1987{
1988 int err;
1989
1990 err = device_create_file(&bmc->dev->dev,
1991 &bmc->device_id_attr);
1992 if (err) goto out;
1993 err = device_create_file(&bmc->dev->dev,
1994 &bmc->provides_dev_sdrs_attr);
1995 if (err) goto out_devid;
1996 err = device_create_file(&bmc->dev->dev,
1997 &bmc->revision_attr);
1998 if (err) goto out_sdrs;
1999 err = device_create_file(&bmc->dev->dev,
2000 &bmc->firmware_rev_attr);
2001 if (err) goto out_rev;
2002 err = device_create_file(&bmc->dev->dev,
2003 &bmc->version_attr);
2004 if (err) goto out_firm;
2005 err = device_create_file(&bmc->dev->dev,
2006 &bmc->add_dev_support_attr);
2007 if (err) goto out_version;
2008 err = device_create_file(&bmc->dev->dev,
2009 &bmc->manufacturer_id_attr);
2010 if (err) goto out_add_dev;
2011 err = device_create_file(&bmc->dev->dev,
2012 &bmc->product_id_attr);
2013 if (err) goto out_manu;
2014 if (bmc->id.aux_firmware_revision_set) {
2015 err = device_create_file(&bmc->dev->dev,
2016 &bmc->aux_firmware_rev_attr);
2017 if (err) goto out_prod_id;
2018 }
2019 if (bmc->guid_set) {
2020 err = device_create_file(&bmc->dev->dev,
2021 &bmc->guid_attr);
2022 if (err) goto out_aux_firm;
2023 }
2024
2025 return 0;
2026
2027out_aux_firm:
2028 if (bmc->id.aux_firmware_revision_set)
2029 device_remove_file(&bmc->dev->dev,
2030 &bmc->aux_firmware_rev_attr);
2031out_prod_id:
2032 device_remove_file(&bmc->dev->dev,
2033 &bmc->product_id_attr);
2034out_manu:
2035 device_remove_file(&bmc->dev->dev,
2036 &bmc->manufacturer_id_attr);
2037out_add_dev:
2038 device_remove_file(&bmc->dev->dev,
2039 &bmc->add_dev_support_attr);
2040out_version:
2041 device_remove_file(&bmc->dev->dev,
2042 &bmc->version_attr);
2043out_firm:
2044 device_remove_file(&bmc->dev->dev,
2045 &bmc->firmware_rev_attr);
2046out_rev:
2047 device_remove_file(&bmc->dev->dev,
2048 &bmc->revision_attr);
2049out_sdrs:
2050 device_remove_file(&bmc->dev->dev,
2051 &bmc->provides_dev_sdrs_attr);
2052out_devid:
2053 device_remove_file(&bmc->dev->dev,
2054 &bmc->device_id_attr);
2055out:
2056 return err;
2057}
2058
1980static int ipmi_bmc_register(ipmi_smi_t intf) 2059static int ipmi_bmc_register(ipmi_smi_t intf)
1981{ 2060{
1982 int rv; 2061 int rv;
@@ -2051,7 +2130,6 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
2051 bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO; 2130 bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
2052 bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show; 2131 bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
2053 2132
2054
2055 bmc->revision_attr.attr.name = "revision"; 2133 bmc->revision_attr.attr.name = "revision";
2056 bmc->revision_attr.attr.owner = THIS_MODULE; 2134 bmc->revision_attr.attr.owner = THIS_MODULE;
2057 bmc->revision_attr.attr.mode = S_IRUGO; 2135 bmc->revision_attr.attr.mode = S_IRUGO;
@@ -2093,28 +2171,14 @@ static int ipmi_bmc_register(ipmi_smi_t intf)
2093 bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO; 2171 bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
2094 bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show; 2172 bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
2095 2173
2096 device_create_file(&bmc->dev->dev, 2174 rv = create_files(bmc);
2097 &bmc->device_id_attr); 2175 if (rv) {
2098 device_create_file(&bmc->dev->dev, 2176 mutex_lock(&ipmidriver_mutex);
2099 &bmc->provides_dev_sdrs_attr); 2177 platform_device_unregister(bmc->dev);
2100 device_create_file(&bmc->dev->dev, 2178 mutex_unlock(&ipmidriver_mutex);
2101 &bmc->revision_attr); 2179
2102 device_create_file(&bmc->dev->dev, 2180 return rv;
2103 &bmc->firmware_rev_attr); 2181 }
2104 device_create_file(&bmc->dev->dev,
2105 &bmc->version_attr);
2106 device_create_file(&bmc->dev->dev,
2107 &bmc->add_dev_support_attr);
2108 device_create_file(&bmc->dev->dev,
2109 &bmc->manufacturer_id_attr);
2110 device_create_file(&bmc->dev->dev,
2111 &bmc->product_id_attr);
2112 if (bmc->id.aux_firmware_revision_set)
2113 device_create_file(&bmc->dev->dev,
2114 &bmc->aux_firmware_rev_attr);
2115 if (bmc->guid_set)
2116 device_create_file(&bmc->dev->dev,
2117 &bmc->guid_attr);
2118 2182
2119 printk(KERN_INFO 2183 printk(KERN_INFO
2120 "ipmi: Found new BMC (man_id: 0x%6.6x, " 2184 "ipmi: Found new BMC (man_id: 0x%6.6x, "
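The ipmi change above, like the eisa-bus and mca-bus changes further down, stops ignoring device_create_file() return values: each attribute is created in order, and a failure unwinds only what was already registered before the error is propagated. A minimal sketch of the pattern with two hypothetical attributes (not the ipmi ones):

#include <linux/device.h>

static int create_two_files(struct device *dev,
                            struct device_attribute *a,
                            struct device_attribute *b)
{
        int err;

        err = device_create_file(dev, a);
        if (err)
                return err;

        err = device_create_file(dev, b);
        if (err) {
                device_remove_file(dev, a);     /* undo the first file */
                return err;
        }

        return 0;
}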
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index a082a2e34252..6ad2d3bb945c 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -1153,7 +1153,14 @@ struct tpm_chip *tpm_register_hardware(struct device *dev, const struct tpm_vend
1153 1153
1154 spin_unlock(&driver_lock); 1154 spin_unlock(&driver_lock);
1155 1155
1156 sysfs_create_group(&dev->kobj, chip->vendor.attr_group); 1156 if (sysfs_create_group(&dev->kobj, chip->vendor.attr_group)) {
1157 list_del(&chip->list);
1158 put_device(dev);
1159 clear_bit(chip->dev_num, dev_mask);
1160 kfree(chip);
1161 kfree(devname);
1162 return NULL;
1163 }
1157 1164
1158 chip->bios_dir = tpm_bios_log_setup(devname); 1165 chip->bios_dir = tpm_bios_log_setup(devname);
1159 1166
diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c
index ad8ffe49256f..1ab0896070be 100644
--- a/drivers/char/tpm/tpm_atmel.c
+++ b/drivers/char/tpm/tpm_atmel.c
@@ -184,7 +184,9 @@ static int __init init_atmel(void)
184 unsigned long base; 184 unsigned long base;
185 struct tpm_chip *chip; 185 struct tpm_chip *chip;
186 186
187 driver_register(&atml_drv); 187 rc = driver_register(&atml_drv);
188 if (rc)
189 return rc;
188 190
189 if ((iobase = atmel_get_base_addr(&base, &region_size)) == NULL) { 191 if ((iobase = atmel_get_base_addr(&base, &region_size)) == NULL) {
190 rc = -ENODEV; 192 rc = -ENODEV;
@@ -195,10 +197,8 @@ static int __init init_atmel(void)
195 (atmel_request_region 197 (atmel_request_region
196 (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1; 198 (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1;
197 199
198 200 pdev = platform_device_register_simple("tpm_atmel", -1, NULL, 0);
199 if (IS_ERR 201 if (IS_ERR(pdev)) {
200 (pdev =
201 platform_device_register_simple("tpm_atmel", -1, NULL, 0))) {
202 rc = PTR_ERR(pdev); 202 rc = PTR_ERR(pdev);
203 goto err_rel_reg; 203 goto err_rel_reg;
204 } 204 }
diff --git a/drivers/char/tpm/tpm_nsc.c b/drivers/char/tpm/tpm_nsc.c
index 26287aace87d..608f73071bef 100644
--- a/drivers/char/tpm/tpm_nsc.c
+++ b/drivers/char/tpm/tpm_nsc.c
@@ -284,7 +284,7 @@ static struct device_driver nsc_drv = {
284static int __init init_nsc(void) 284static int __init init_nsc(void)
285{ 285{
286 int rc = 0; 286 int rc = 0;
287 int lo, hi; 287 int lo, hi, err;
288 int nscAddrBase = TPM_ADDR; 288 int nscAddrBase = TPM_ADDR;
289 struct tpm_chip *chip; 289 struct tpm_chip *chip;
290 unsigned long base; 290 unsigned long base;
@@ -297,7 +297,9 @@ static int __init init_nsc(void)
297 return -ENODEV; 297 return -ENODEV;
298 } 298 }
299 299
300 driver_register(&nsc_drv); 300 err = driver_register(&nsc_drv);
301 if (err)
302 return err;
301 303
302 hi = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_HI); 304 hi = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_HI);
303 lo = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_LO); 305 lo = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_LO);
diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c
index 3a365e159d89..d944647c82c2 100644
--- a/drivers/eisa/eisa-bus.c
+++ b/drivers/eisa/eisa-bus.c
@@ -226,14 +226,26 @@ static int __init eisa_init_device (struct eisa_root_device *root,
226 226
227static int __init eisa_register_device (struct eisa_device *edev) 227static int __init eisa_register_device (struct eisa_device *edev)
228{ 228{
229 if (device_register (&edev->dev)) 229 int rc = device_register (&edev->dev);
230 return -1; 230 if (rc)
231 return rc;
231 232
232 device_create_file (&edev->dev, &dev_attr_signature); 233 rc = device_create_file (&edev->dev, &dev_attr_signature);
233 device_create_file (&edev->dev, &dev_attr_enabled); 234 if (rc) goto err_devreg;
234 device_create_file (&edev->dev, &dev_attr_modalias); 235 rc = device_create_file (&edev->dev, &dev_attr_enabled);
236 if (rc) goto err_sig;
237 rc = device_create_file (&edev->dev, &dev_attr_modalias);
238 if (rc) goto err_enab;
235 239
236 return 0; 240 return 0;
241
242err_enab:
243 device_remove_file (&edev->dev, &dev_attr_enabled);
244err_sig:
245 device_remove_file (&edev->dev, &dev_attr_signature);
246err_devreg:
247 device_unregister(&edev->dev);
248 return rc;
237} 249}
238 250
239static int __init eisa_request_resources (struct eisa_root_device *root, 251static int __init eisa_request_resources (struct eisa_root_device *root,
diff --git a/drivers/firmware/dell_rbu.c b/drivers/firmware/dell_rbu.c
index fc17599c905e..08b161798443 100644
--- a/drivers/firmware/dell_rbu.c
+++ b/drivers/firmware/dell_rbu.c
@@ -249,7 +249,7 @@ static int packetize_data(void *data, size_t length)
249 if ((rc = create_packet(temp, packet_length))) 249 if ((rc = create_packet(temp, packet_length)))
250 return rc; 250 return rc;
251 251
252 pr_debug("%p:%lu\n", temp, (end - temp)); 252 pr_debug("%p:%td\n", temp, (end - temp));
253 temp += packet_length; 253 temp += packet_length;
254 } 254 }
255 255
@@ -718,14 +718,27 @@ static int __init dcdrbu_init(void)
718 return -EIO; 718 return -EIO;
719 } 719 }
720 720
721 sysfs_create_bin_file(&rbu_device->dev.kobj, &rbu_data_attr); 721 rc = sysfs_create_bin_file(&rbu_device->dev.kobj, &rbu_data_attr);
722 sysfs_create_bin_file(&rbu_device->dev.kobj, &rbu_image_type_attr); 722 if (rc)
723 sysfs_create_bin_file(&rbu_device->dev.kobj, 723 goto out_devreg;
724 rc = sysfs_create_bin_file(&rbu_device->dev.kobj, &rbu_image_type_attr);
725 if (rc)
726 goto out_data;
727 rc = sysfs_create_bin_file(&rbu_device->dev.kobj,
724 &rbu_packet_size_attr); 728 &rbu_packet_size_attr);
729 if (rc)
730 goto out_imtype;
725 731
726 rbu_data.entry_created = 0; 732 rbu_data.entry_created = 0;
727 return rc; 733 return 0;
728 734
735out_imtype:
736 sysfs_remove_bin_file(&rbu_device->dev.kobj, &rbu_image_type_attr);
737out_data:
738 sysfs_remove_bin_file(&rbu_device->dev.kobj, &rbu_data_attr);
739out_devreg:
740 platform_device_unregister(rbu_device);
741 return rc;
729} 742}
730 743
731static __exit void dcdrbu_exit(void) 744static __exit void dcdrbu_exit(void)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 8ebce1c03ad7..5ab5e393b882 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -639,7 +639,12 @@ efivar_create_sysfs_entry(unsigned long variable_name_size,
639 639
640 kobject_set_name(&new_efivar->kobj, "%s", short_name); 640 kobject_set_name(&new_efivar->kobj, "%s", short_name);
641 kobj_set_kset_s(new_efivar, vars_subsys); 641 kobj_set_kset_s(new_efivar, vars_subsys);
642 kobject_register(&new_efivar->kobj); 642 i = kobject_register(&new_efivar->kobj);
643 if (i) {
644 kfree(short_name);
645 kfree(new_efivar);
646 return 1;
647 }
643 648
644 kfree(short_name); 649 kfree(short_name);
645 short_name = NULL; 650 short_name = NULL;
diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index 965c43659e35..5b77a5bcbf0c 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -237,10 +237,12 @@ static int __devinit generic_init_one(struct pci_dev *dev, const struct pci_devi
237 if (dev->vendor == PCI_VENDOR_ID_JMICRON && PCI_FUNC(dev->devfn) != 1) 237 if (dev->vendor == PCI_VENDOR_ID_JMICRON && PCI_FUNC(dev->devfn) != 1)
238 goto out; 238 goto out;
239 239
240 pci_read_config_word(dev, PCI_COMMAND, &command); 240 if (dev->vendor != PCI_VENDOR_ID_JMICRON) {
241 if (!(command & PCI_COMMAND_IO)) { 241 pci_read_config_word(dev, PCI_COMMAND, &command);
242 printk(KERN_INFO "Skipping disabled %s IDE controller.\n", d->name); 242 if (!(command & PCI_COMMAND_IO)) {
243 goto out; 243 printk(KERN_INFO "Skipping disabled %s IDE controller.\n", d->name);
244 goto out;
245 }
244 } 246 }
245 ret = ide_setup_pci_device(dev, d); 247 ret = ide_setup_pci_device(dev, d);
246out: 248out:
diff --git a/drivers/input/misc/wistron_btns.c b/drivers/input/misc/wistron_btns.c
index 4639537336fc..7b9d1c1da41a 100644
--- a/drivers/input/misc/wistron_btns.c
+++ b/drivers/input/misc/wistron_btns.c
@@ -17,7 +17,7 @@
17 * with this program; if not, write to the Free Software Foundation, Inc., 17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 59 Temple Place Suite 330, Boston, MA 02111-1307, USA. 18 * 59 Temple Place Suite 330, Boston, MA 02111-1307, USA.
19 */ 19 */
20#include <asm/io.h> 20#include <linux/io.h>
21#include <linux/dmi.h> 21#include <linux/dmi.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/input.h> 23#include <linux/input.h>
diff --git a/drivers/isdn/pcbit/layer2.c b/drivers/isdn/pcbit/layer2.c
index 13e7d219d1c7..937fd2120381 100644
--- a/drivers/isdn/pcbit/layer2.c
+++ b/drivers/isdn/pcbit/layer2.c
@@ -311,6 +311,7 @@ pcbit_deliver(void *data)
311 dev->read_queue = frame->next; 311 dev->read_queue = frame->next;
312 spin_unlock_irqrestore(&dev->lock, flags); 312 spin_unlock_irqrestore(&dev->lock, flags);
313 313
314 msg = 0;
314 SET_MSG_CPU(msg, 0); 315 SET_MSG_CPU(msg, 0);
315 SET_MSG_PROC(msg, 0); 316 SET_MSG_PROC(msg, 0);
316 SET_MSG_CMD(msg, frame->skb->data[2]); 317 SET_MSG_CMD(msg, frame->skb->data[2]);
diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c
index 222ca7c08baa..06c9872e8c6a 100644
--- a/drivers/isdn/sc/init.c
+++ b/drivers/isdn/sc/init.c
@@ -98,13 +98,14 @@ static int __init sc_init(void)
98 * Confirm the I/O Address with a test 98 * Confirm the I/O Address with a test
99 */ 99 */
100 if(io[b] == 0) { 100 if(io[b] == 0) {
101 pr_debug("I/O Address 0x%x is in use.\n"); 101 pr_debug("I/O Address invalid.\n");
102 continue; 102 continue;
103 } 103 }
104 104
105 outb(0x18, io[b] + 0x400 * EXP_PAGE0); 105 outb(0x18, io[b] + 0x400 * EXP_PAGE0);
106 if(inb(io[b] + 0x400 * EXP_PAGE0) != 0x18) { 106 if(inb(io[b] + 0x400 * EXP_PAGE0) != 0x18) {
107 pr_debug("I/O Base 0x%x fails test\n"); 107 pr_debug("I/O Base 0x%x fails test\n",
108 io[b] + 0x400 * EXP_PAGE0);
108 continue; 109 continue;
109 } 110 }
110 } 111 }
@@ -158,8 +159,8 @@ static int __init sc_init(void)
158 outb(0xFF, io[b] + RESET_OFFSET); 159 outb(0xFF, io[b] + RESET_OFFSET);
159 msleep_interruptible(10000); 160 msleep_interruptible(10000);
160 } 161 }
161 pr_debug("RAM Base for board %d is 0x%x, %s probe\n", b, ram[b], 162 pr_debug("RAM Base for board %d is 0x%lx, %s probe\n", b,
162 ram[b] == 0 ? "will" : "won't"); 163 ram[b], ram[b] == 0 ? "will" : "won't");
163 164
164 if(ram[b]) { 165 if(ram[b]) {
165 /* 166 /*
@@ -168,7 +169,7 @@ static int __init sc_init(void)
168 * board model 169 * board model
169 */ 170 */
170 if(request_region(ram[b], SRAM_PAGESIZE, "sc test")) { 171 if(request_region(ram[b], SRAM_PAGESIZE, "sc test")) {
171 pr_debug("request_region for RAM base 0x%x succeeded\n", ram[b]); 172 pr_debug("request_region for RAM base 0x%lx succeeded\n", ram[b]);
172 model = identify_board(ram[b], io[b]); 173 model = identify_board(ram[b], io[b]);
173 release_region(ram[b], SRAM_PAGESIZE); 174 release_region(ram[b], SRAM_PAGESIZE);
174 } 175 }
@@ -204,7 +205,7 @@ static int __init sc_init(void)
204 * Nope, there was no place in RAM for the 205 * Nope, there was no place in RAM for the
205 * board, or it couldn't be identified 206 * board, or it couldn't be identified
206 */ 207 */
207 pr_debug("Failed to find an adapter at 0x%x\n", ram[b]); 208 pr_debug("Failed to find an adapter at 0x%lx\n", ram[b]);
208 continue; 209 continue;
209 } 210 }
210 211
@@ -451,7 +452,7 @@ static int identify_board(unsigned long rambase, unsigned int iobase)
451 HWConfig_pl hwci; 452 HWConfig_pl hwci;
452 int x; 453 int x;
453 454
454 pr_debug("Attempting to identify adapter @ 0x%x io 0x%x\n", 455 pr_debug("Attempting to identify adapter @ 0x%lx io 0x%x\n",
455 rambase, iobase); 456 rambase, iobase);
456 457
457 /* 458 /*
@@ -490,7 +491,7 @@ static int identify_board(unsigned long rambase, unsigned int iobase)
490 outb(PRI_BASEPG_VAL, pgport); 491 outb(PRI_BASEPG_VAL, pgport);
491 msleep_interruptible(1000); 492 msleep_interruptible(1000);
492 sig = readl(rambase + SIG_OFFSET); 493 sig = readl(rambase + SIG_OFFSET);
493 pr_debug("Looking for a signature, got 0x%x\n", sig); 494 pr_debug("Looking for a signature, got 0x%lx\n", sig);
494 if(sig == SIGNATURE) 495 if(sig == SIGNATURE)
495 return PRI_BOARD; 496 return PRI_BOARD;
496 497
@@ -500,7 +501,7 @@ static int identify_board(unsigned long rambase, unsigned int iobase)
500 outb(BRI_BASEPG_VAL, pgport); 501 outb(BRI_BASEPG_VAL, pgport);
501 msleep_interruptible(1000); 502 msleep_interruptible(1000);
502 sig = readl(rambase + SIG_OFFSET); 503 sig = readl(rambase + SIG_OFFSET);
503 pr_debug("Looking for a signature, got 0x%x\n", sig); 504 pr_debug("Looking for a signature, got 0x%lx\n", sig);
504 if(sig == SIGNATURE) 505 if(sig == SIGNATURE)
505 return BRI_BOARD; 506 return BRI_BOARD;
506 507
@@ -510,7 +511,7 @@ static int identify_board(unsigned long rambase, unsigned int iobase)
510 * Try to spot a card 511 * Try to spot a card
511 */ 512 */
512 sig = readl(rambase + SIG_OFFSET); 513 sig = readl(rambase + SIG_OFFSET);
513 pr_debug("Looking for a signature, got 0x%x\n", sig); 514 pr_debug("Looking for a signature, got 0x%lx\n", sig);
514 if(sig != SIGNATURE) 515 if(sig != SIGNATURE)
515 return -1; 516 return -1;
516 517
@@ -540,7 +541,7 @@ static int identify_board(unsigned long rambase, unsigned int iobase)
540 memcpy_fromio(&rcvmsg, &(dpm->rsp_queue[dpm->rsp_tail]), MSG_LEN); 541 memcpy_fromio(&rcvmsg, &(dpm->rsp_queue[dpm->rsp_tail]), MSG_LEN);
541 pr_debug("Got HWConfig response, status = 0x%x\n", rcvmsg.rsp_status); 542 pr_debug("Got HWConfig response, status = 0x%x\n", rcvmsg.rsp_status);
542 memcpy(&hwci, &(rcvmsg.msg_data.HWCresponse), sizeof(HWConfig_pl)); 543 memcpy(&hwci, &(rcvmsg.msg_data.HWCresponse), sizeof(HWConfig_pl));
543 pr_debug("Hardware Config: Interface: %s, RAM Size: %d, Serial: %s\n" 544 pr_debug("Hardware Config: Interface: %s, RAM Size: %ld, Serial: %s\n"
544 " Part: %s, Rev: %s\n", 545 " Part: %s, Rev: %s\n",
545 hwci.st_u_sense ? "S/T" : "U", hwci.ram_size, 546 hwci.st_u_sense ? "S/T" : "U", hwci.ram_size,
546 hwci.serial_no, hwci.part_no, hwci.rev_no); 547 hwci.serial_no, hwci.part_no, hwci.rev_no);
diff --git a/drivers/isdn/sc/packet.c b/drivers/isdn/sc/packet.c
index f50defc38ae5..1e04676b016b 100644
--- a/drivers/isdn/sc/packet.c
+++ b/drivers/isdn/sc/packet.c
@@ -44,7 +44,7 @@ int sndpkt(int devId, int channel, struct sk_buff *data)
44 return -ENODEV; 44 return -ENODEV;
45 } 45 }
46 46
47 pr_debug("%s: sndpkt: frst = 0x%x nxt = %d f = %d n = %d\n", 47 pr_debug("%s: sndpkt: frst = 0x%lx nxt = %d f = %d n = %d\n",
48 sc_adapter[card]->devicename, 48 sc_adapter[card]->devicename,
49 sc_adapter[card]->channel[channel].first_sendbuf, 49 sc_adapter[card]->channel[channel].first_sendbuf,
50 sc_adapter[card]->channel[channel].next_sendbuf, 50 sc_adapter[card]->channel[channel].next_sendbuf,
@@ -66,7 +66,7 @@ int sndpkt(int devId, int channel, struct sk_buff *data)
66 ReqLnkWrite.buff_offset = sc_adapter[card]->channel[channel].next_sendbuf * 66 ReqLnkWrite.buff_offset = sc_adapter[card]->channel[channel].next_sendbuf *
67 BUFFER_SIZE + sc_adapter[card]->channel[channel].first_sendbuf; 67 BUFFER_SIZE + sc_adapter[card]->channel[channel].first_sendbuf;
68 ReqLnkWrite.msg_len = data->len; /* sk_buff size */ 68 ReqLnkWrite.msg_len = data->len; /* sk_buff size */
69 pr_debug("%s: writing %d bytes to buffer offset 0x%x\n", 69 pr_debug("%s: writing %d bytes to buffer offset 0x%lx\n",
70 sc_adapter[card]->devicename, 70 sc_adapter[card]->devicename,
71 ReqLnkWrite.msg_len, ReqLnkWrite.buff_offset); 71 ReqLnkWrite.msg_len, ReqLnkWrite.buff_offset);
72 memcpy_toshmem(card, (char *)ReqLnkWrite.buff_offset, data->data, ReqLnkWrite.msg_len); 72 memcpy_toshmem(card, (char *)ReqLnkWrite.buff_offset, data->data, ReqLnkWrite.msg_len);
@@ -74,7 +74,7 @@ int sndpkt(int devId, int channel, struct sk_buff *data)
74 /* 74 /*
75 * sendmessage 75 * sendmessage
76 */ 76 */
77 pr_debug("%s: sndpkt size=%d, buf_offset=0x%x buf_indx=%d\n", 77 pr_debug("%s: sndpkt size=%d, buf_offset=0x%lx buf_indx=%d\n",
78 sc_adapter[card]->devicename, 78 sc_adapter[card]->devicename,
79 ReqLnkWrite.msg_len, ReqLnkWrite.buff_offset, 79 ReqLnkWrite.msg_len, ReqLnkWrite.buff_offset,
80 sc_adapter[card]->channel[channel].next_sendbuf); 80 sc_adapter[card]->channel[channel].next_sendbuf);
@@ -124,7 +124,7 @@ void rcvpkt(int card, RspMessage *rcvmsg)
124 return; 124 return;
125 } 125 }
126 skb_put(skb, rcvmsg->msg_data.response.msg_len); 126 skb_put(skb, rcvmsg->msg_data.response.msg_len);
127 pr_debug("%s: getting data from offset: 0x%x\n", 127 pr_debug("%s: getting data from offset: 0x%lx\n",
128 sc_adapter[card]->devicename, 128 sc_adapter[card]->devicename,
129 rcvmsg->msg_data.response.buff_offset); 129 rcvmsg->msg_data.response.buff_offset);
130 memcpy_fromshmem(card, 130 memcpy_fromshmem(card,
@@ -143,7 +143,7 @@ void rcvpkt(int card, RspMessage *rcvmsg)
143/* memset_shmem(card, rcvmsg->msg_data.response.buff_offset, 0, BUFFER_SIZE); */ 143/* memset_shmem(card, rcvmsg->msg_data.response.buff_offset, 0, BUFFER_SIZE); */
144 newll.buff_offset = rcvmsg->msg_data.response.buff_offset; 144 newll.buff_offset = rcvmsg->msg_data.response.buff_offset;
145 newll.msg_len = BUFFER_SIZE; 145 newll.msg_len = BUFFER_SIZE;
146 pr_debug("%s: recycled buffer at offset 0x%x size %d\n", 146 pr_debug("%s: recycled buffer at offset 0x%lx size %d\n",
147 sc_adapter[card]->devicename, 147 sc_adapter[card]->devicename,
148 newll.buff_offset, newll.msg_len); 148 newll.buff_offset, newll.msg_len);
149 sendmessage(card, CEPID, ceReqTypeLnk, ceReqClass1, ceReqLnkRead, 149 sendmessage(card, CEPID, ceReqTypeLnk, ceReqClass1, ceReqLnkRead,
@@ -186,7 +186,7 @@ int setup_buffers(int card, int c)
186 sc_adapter[card]->channel[c-1].num_sendbufs = nBuffers / 2; 186 sc_adapter[card]->channel[c-1].num_sendbufs = nBuffers / 2;
187 sc_adapter[card]->channel[c-1].free_sendbufs = nBuffers / 2; 187 sc_adapter[card]->channel[c-1].free_sendbufs = nBuffers / 2;
188 sc_adapter[card]->channel[c-1].next_sendbuf = 0; 188 sc_adapter[card]->channel[c-1].next_sendbuf = 0;
189 pr_debug("%s: send buffer setup complete: first=0x%x n=%d f=%d, nxt=%d\n", 189 pr_debug("%s: send buffer setup complete: first=0x%lx n=%d f=%d, nxt=%d\n",
190 sc_adapter[card]->devicename, 190 sc_adapter[card]->devicename,
191 sc_adapter[card]->channel[c-1].first_sendbuf, 191 sc_adapter[card]->channel[c-1].first_sendbuf,
192 sc_adapter[card]->channel[c-1].num_sendbufs, 192 sc_adapter[card]->channel[c-1].num_sendbufs,
@@ -203,7 +203,7 @@ int setup_buffers(int card, int c)
203 ((sc_adapter[card]->channel[c-1].first_sendbuf + 203 ((sc_adapter[card]->channel[c-1].first_sendbuf +
204 (nBuffers / 2) * buffer_size) + (buffer_size * i)); 204 (nBuffers / 2) * buffer_size) + (buffer_size * i));
205 RcvBuffOffset.msg_len = buffer_size; 205 RcvBuffOffset.msg_len = buffer_size;
206 pr_debug("%s: adding RcvBuffer #%d offset=0x%x sz=%d bufsz:%d\n", 206 pr_debug("%s: adding RcvBuffer #%d offset=0x%lx sz=%d bufsz:%d\n",
207 sc_adapter[card]->devicename, 207 sc_adapter[card]->devicename,
208 i + 1, RcvBuffOffset.buff_offset, 208 i + 1, RcvBuffOffset.buff_offset,
209 RcvBuffOffset.msg_len,buffer_size); 209 RcvBuffOffset.msg_len,buffer_size);
diff --git a/drivers/isdn/sc/shmem.c b/drivers/isdn/sc/shmem.c
index 24854826ca45..6f58862992db 100644
--- a/drivers/isdn/sc/shmem.c
+++ b/drivers/isdn/sc/shmem.c
@@ -61,7 +61,7 @@ void memcpy_toshmem(int card, void *dest, const void *src, size_t n)
61 spin_unlock_irqrestore(&sc_adapter[card]->lock, flags); 61 spin_unlock_irqrestore(&sc_adapter[card]->lock, flags);
62 pr_debug("%s: set page to %#x\n",sc_adapter[card]->devicename, 62 pr_debug("%s: set page to %#x\n",sc_adapter[card]->devicename,
63 ((sc_adapter[card]->shmem_magic + ch * SRAM_PAGESIZE)>>14)|0x80); 63 ((sc_adapter[card]->shmem_magic + ch * SRAM_PAGESIZE)>>14)|0x80);
64 pr_debug("%s: copying %d bytes from %#x to %#x\n", 64 pr_debug("%s: copying %d bytes from %#lx to %#lx\n",
65 sc_adapter[card]->devicename, n, 65 sc_adapter[card]->devicename, n,
66 (unsigned long) src, 66 (unsigned long) src,
67 sc_adapter[card]->rambase + ((unsigned long) dest %0x4000)); 67 sc_adapter[card]->rambase + ((unsigned long) dest %0x4000));
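The isdn/sc changes in this series are mostly printk format fixes: unsigned long values (RAM bases, shared-memory offsets) were printed with %x, and one pr_debug() was missing its argument entirely. A small illustrative helper (hypothetical, not from the patch) showing the matching specifiers:

#include <linux/kernel.h>

/* %lx for unsigned long, %td for a pointer difference; a mismatched
 * specifier prints garbage on 64-bit and triggers gcc's format warnings. */
static void show_region(unsigned long rambase, const char *start, const char *end)
{
        printk(KERN_DEBUG "RAM base 0x%lx, span %td bytes\n",
               rambase, end - start);
}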
diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c
index 09baa43b2599..da862e4632dd 100644
--- a/drivers/mca/mca-bus.c
+++ b/drivers/mca/mca-bus.c
@@ -100,6 +100,7 @@ static DEVICE_ATTR(pos, S_IRUGO, mca_show_pos, NULL);
100int __init mca_register_device(int bus, struct mca_device *mca_dev) 100int __init mca_register_device(int bus, struct mca_device *mca_dev)
101{ 101{
102 struct mca_bus *mca_bus = mca_root_busses[bus]; 102 struct mca_bus *mca_bus = mca_root_busses[bus];
103 int rc;
103 104
104 mca_dev->dev.parent = &mca_bus->dev; 105 mca_dev->dev.parent = &mca_bus->dev;
105 mca_dev->dev.bus = &mca_bus_type; 106 mca_dev->dev.bus = &mca_bus_type;
@@ -108,13 +109,23 @@ int __init mca_register_device(int bus, struct mca_device *mca_dev)
108 mca_dev->dev.dma_mask = &mca_dev->dma_mask; 109 mca_dev->dev.dma_mask = &mca_dev->dma_mask;
109 mca_dev->dev.coherent_dma_mask = mca_dev->dma_mask; 110 mca_dev->dev.coherent_dma_mask = mca_dev->dma_mask;
110 111
111 if (device_register(&mca_dev->dev)) 112 rc = device_register(&mca_dev->dev);
112 return 0; 113 if (rc)
114 goto err_out;
113 115
114 device_create_file(&mca_dev->dev, &dev_attr_id); 116 rc = device_create_file(&mca_dev->dev, &dev_attr_id);
115 device_create_file(&mca_dev->dev, &dev_attr_pos); 117 if (rc) goto err_out_devreg;
118 rc = device_create_file(&mca_dev->dev, &dev_attr_pos);
119 if (rc) goto err_out_id;
116 120
117 return 1; 121 return 1;
122
123err_out_id:
124 device_remove_file(&mca_dev->dev, &dev_attr_id);
125err_out_devreg:
126 device_unregister(&mca_dev->dev);
127err_out:
128 return 0;
118} 129}
119 130
120/* */ 131/* */
@@ -130,13 +141,16 @@ struct mca_bus * __devinit mca_attach_bus(int bus)
130 return NULL; 141 return NULL;
131 } 142 }
132 143
133 mca_bus = kmalloc(sizeof(struct mca_bus), GFP_KERNEL); 144 mca_bus = kzalloc(sizeof(struct mca_bus), GFP_KERNEL);
134 if (!mca_bus) 145 if (!mca_bus)
135 return NULL; 146 return NULL;
136 memset(mca_bus, 0, sizeof(struct mca_bus)); 147
137 sprintf(mca_bus->dev.bus_id,"mca%d",bus); 148 sprintf(mca_bus->dev.bus_id,"mca%d",bus);
138 sprintf(mca_bus->name,"Host %s MCA Bridge", bus ? "Secondary" : "Primary"); 149 sprintf(mca_bus->name,"Host %s MCA Bridge", bus ? "Secondary" : "Primary");
139 device_register(&mca_bus->dev); 150 if (device_register(&mca_bus->dev)) {
151 kfree(mca_bus);
152 return NULL;
153 }
140 154
141 mca_root_busses[bus] = mca_bus; 155 mca_root_busses[bus] = mca_bus;
142 156
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 8e67634e79a0..d47d38ac71b1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1413,7 +1413,7 @@ int bitmap_create(mddev_t *mddev)
1413 int err; 1413 int err;
1414 sector_t start; 1414 sector_t start;
1415 1415
1416 BUG_ON(sizeof(bitmap_super_t) != 256); 1416 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1417 1417
1418 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ 1418 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
1419 return 0; 1419 return 0;
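The bitmap change swaps a runtime BUG_ON() for BUILD_BUG_ON(), so a wrong on-disk superblock size becomes a compile error rather than a crash inside bitmap_create(). A sketch of the idiom with a hypothetical 256-byte on-disk layout:

#include <linux/kernel.h>
#include <linux/types.h>

struct example_ondisk {
        __le32 magic;
        __le32 version;
        u8     pad[248];
} __attribute__((packed));

static inline void example_check_layout(void)
{
        /* Fails the build if the structure ever stops being 256 bytes. */
        BUILD_BUG_ON(sizeof(struct example_ondisk) != 256);
}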
diff --git a/drivers/net/b44.c b/drivers/net/b44.c
index b124eee4eb10..1ec217433b4c 100644
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -1706,14 +1706,15 @@ static void __b44_set_rx_mode(struct net_device *dev)
1706 1706
1707 __b44_set_mac_addr(bp); 1707 __b44_set_mac_addr(bp);
1708 1708
1709 if (dev->flags & IFF_ALLMULTI) 1709 if ((dev->flags & IFF_ALLMULTI) ||
1710 (dev->mc_count > B44_MCAST_TABLE_SIZE))
1710 val |= RXCONFIG_ALLMULTI; 1711 val |= RXCONFIG_ALLMULTI;
1711 else 1712 else
1712 i = __b44_load_mcast(bp, dev); 1713 i = __b44_load_mcast(bp, dev);
1713 1714
1714 for (; i < 64; i++) { 1715 for (; i < 64; i++)
1715 __b44_cam_write(bp, zero, i); 1716 __b44_cam_write(bp, zero, i);
1716 } 1717
1717 bw32(bp, B44_RXCONFIG, val); 1718 bw32(bp, B44_RXCONFIG, val);
1718 val = br32(bp, B44_CAM_CTRL); 1719 val = br32(bp, B44_CAM_CTRL);
1719 bw32(bp, B44_CAM_CTRL, val | CAM_CTRL_ENABLE); 1720 bw32(bp, B44_CAM_CTRL, val | CAM_CTRL_ENABLE);
@@ -2055,7 +2056,7 @@ static int b44_read_eeprom(struct b44 *bp, u8 *data)
2055 u16 *ptr = (u16 *) data; 2056 u16 *ptr = (u16 *) data;
2056 2057
2057 for (i = 0; i < 128; i += 2) 2058 for (i = 0; i < 128; i += 2)
2058 ptr[i / 2] = readw(bp->regs + 4096 + i); 2059 ptr[i / 2] = cpu_to_le16(readw(bp->regs + 4096 + i));
2059 2060
2060 return 0; 2061 return 0;
2061} 2062}
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index e83bc825f6af..32923162179e 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1433,7 +1433,7 @@ void bond_alb_monitor(struct bonding *bond)
1433 * write lock to protect from other code that also 1433 * write lock to protect from other code that also
1434 * sets the promiscuity. 1434 * sets the promiscuity.
1435 */ 1435 */
1436 write_lock(&bond->curr_slave_lock); 1436 write_lock_bh(&bond->curr_slave_lock);
1437 1437
1438 if (bond_info->primary_is_promisc && 1438 if (bond_info->primary_is_promisc &&
1439 (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) { 1439 (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) {
@@ -1448,7 +1448,7 @@ void bond_alb_monitor(struct bonding *bond)
1448 bond_info->primary_is_promisc = 0; 1448 bond_info->primary_is_promisc = 0;
1449 } 1449 }
1450 1450
1451 write_unlock(&bond->curr_slave_lock); 1451 write_unlock_bh(&bond->curr_slave_lock);
1452 1452
1453 if (bond_info->rlb_rebalance) { 1453 if (bond_info->rlb_rebalance) {
1454 bond_info->rlb_rebalance = 0; 1454 bond_info->rlb_rebalance = 0;
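The bonding fix switches bond_alb_monitor() to the _bh lock variants, closing a window where a softirq taking the same rwlock could deadlock against a holder that had not disabled bottom halves. A minimal sketch of the rule with a hypothetical lock:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_lock);

/* Called outside softirq context; the same data is also touched from a
 * softirq path, so block softirqs locally while the write lock is held. */
static void example_periodic_update(void)
{
        write_lock_bh(&example_lock);
        /* ... update state shared with the softirq path ... */
        write_unlock_bh(&example_lock);
}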
diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 23b451a8ae12..b40724fc6b74 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -39,7 +39,7 @@
39#include <asm/io.h> 39#include <asm/io.h>
40 40
41#define DRV_NAME "ehea" 41#define DRV_NAME "ehea"
42#define DRV_VERSION "EHEA_0028" 42#define DRV_VERSION "EHEA_0034"
43 43
44#define EHEA_MSG_DEFAULT (NETIF_MSG_LINK | NETIF_MSG_TIMER \ 44#define EHEA_MSG_DEFAULT (NETIF_MSG_LINK | NETIF_MSG_TIMER \
45 | NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) 45 | NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
@@ -50,6 +50,7 @@
50#define EHEA_MAX_ENTRIES_SQ 32767 50#define EHEA_MAX_ENTRIES_SQ 32767
51#define EHEA_MIN_ENTRIES_QP 127 51#define EHEA_MIN_ENTRIES_QP 127
52 52
53#define EHEA_SMALL_QUEUES
53#define EHEA_NUM_TX_QP 1 54#define EHEA_NUM_TX_QP 1
54 55
55#ifdef EHEA_SMALL_QUEUES 56#ifdef EHEA_SMALL_QUEUES
@@ -59,11 +60,11 @@
59#define EHEA_DEF_ENTRIES_RQ2 1023 60#define EHEA_DEF_ENTRIES_RQ2 1023
60#define EHEA_DEF_ENTRIES_RQ3 1023 61#define EHEA_DEF_ENTRIES_RQ3 1023
61#else 62#else
62#define EHEA_MAX_CQE_COUNT 32000 63#define EHEA_MAX_CQE_COUNT 4080
63#define EHEA_DEF_ENTRIES_SQ 16000 64#define EHEA_DEF_ENTRIES_SQ 4080
64#define EHEA_DEF_ENTRIES_RQ1 32080 65#define EHEA_DEF_ENTRIES_RQ1 8160
65#define EHEA_DEF_ENTRIES_RQ2 4020 66#define EHEA_DEF_ENTRIES_RQ2 2040
66#define EHEA_DEF_ENTRIES_RQ3 4020 67#define EHEA_DEF_ENTRIES_RQ3 2040
67#endif 68#endif
68 69
69#define EHEA_MAX_ENTRIES_EQ 20 70#define EHEA_MAX_ENTRIES_EQ 20
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index c6b31775e26b..eb7d44de59ff 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -766,7 +766,7 @@ static void ehea_parse_eqe(struct ehea_adapter *adapter, u64 eqe)
766 if (EHEA_BMASK_GET(NEQE_PORT_UP, eqe)) { 766 if (EHEA_BMASK_GET(NEQE_PORT_UP, eqe)) {
767 if (!netif_carrier_ok(port->netdev)) { 767 if (!netif_carrier_ok(port->netdev)) {
768 ret = ehea_sense_port_attr( 768 ret = ehea_sense_port_attr(
769 adapter->port[portnum]); 769 port);
770 if (ret) { 770 if (ret) {
771 ehea_error("failed resensing port " 771 ehea_error("failed resensing port "
772 "attributes"); 772 "attributes");
@@ -818,7 +818,7 @@ static void ehea_parse_eqe(struct ehea_adapter *adapter, u64 eqe)
818 netif_stop_queue(port->netdev); 818 netif_stop_queue(port->netdev);
819 break; 819 break;
820 default: 820 default:
821 ehea_error("unknown event code %x", ec); 821 ehea_error("unknown event code %x, eqe=0x%lX", ec, eqe);
822 break; 822 break;
823 } 823 }
824} 824}
@@ -1841,7 +1841,7 @@ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
1841 1841
1842 if (netif_msg_tx_queued(port)) { 1842 if (netif_msg_tx_queued(port)) {
1843 ehea_info("post swqe on QP %d", pr->qp->init_attr.qp_nr); 1843 ehea_info("post swqe on QP %d", pr->qp->init_attr.qp_nr);
1844 ehea_dump(swqe, sizeof(*swqe), "swqe"); 1844 ehea_dump(swqe, 512, "swqe");
1845 } 1845 }
1846 1846
1847 ehea_post_swqe(pr->qp, swqe); 1847 ehea_post_swqe(pr->qp, swqe);
diff --git a/drivers/net/ehea/ehea_phyp.c b/drivers/net/ehea/ehea_phyp.c
index 4a85aca4c7e9..0b51a8cea077 100644
--- a/drivers/net/ehea/ehea_phyp.c
+++ b/drivers/net/ehea/ehea_phyp.c
@@ -44,71 +44,99 @@ static inline u16 get_order_of_qentries(u16 queue_entries)
44#define H_ALL_RES_TYPE_MR 5 44#define H_ALL_RES_TYPE_MR 5
45#define H_ALL_RES_TYPE_MW 6 45#define H_ALL_RES_TYPE_MW 6
46 46
47static long ehea_hcall_9arg_9ret(unsigned long opcode, 47static long ehea_plpar_hcall_norets(unsigned long opcode,
48 unsigned long arg1, unsigned long arg2, 48 unsigned long arg1,
49 unsigned long arg3, unsigned long arg4, 49 unsigned long arg2,
50 unsigned long arg5, unsigned long arg6, 50 unsigned long arg3,
51 unsigned long arg7, unsigned long arg8, 51 unsigned long arg4,
52 unsigned long arg9, unsigned long *out1, 52 unsigned long arg5,
53 unsigned long *out2,unsigned long *out3, 53 unsigned long arg6,
54 unsigned long *out4,unsigned long *out5, 54 unsigned long arg7)
55 unsigned long *out6,unsigned long *out7,
56 unsigned long *out8,unsigned long *out9)
57{ 55{
58 long hret; 56 long ret;
59 int i, sleep_msecs; 57 int i, sleep_msecs;
60 58
61 for (i = 0; i < 5; i++) { 59 for (i = 0; i < 5; i++) {
62 hret = plpar_hcall_9arg_9ret(opcode,arg1, arg2, arg3, arg4, 60 ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
63 arg5, arg6, arg7, arg8, arg9, out1, 61 arg5, arg6, arg7);
64 out2, out3, out4, out5, out6, out7, 62
65 out8, out9); 63 if (H_IS_LONG_BUSY(ret)) {
66 if (H_IS_LONG_BUSY(hret)) { 64 sleep_msecs = get_longbusy_msecs(ret);
67 sleep_msecs = get_longbusy_msecs(hret);
68 msleep_interruptible(sleep_msecs); 65 msleep_interruptible(sleep_msecs);
69 continue; 66 continue;
70 } 67 }
71 68
72 if (hret < H_SUCCESS) 69 if (ret < H_SUCCESS)
73 ehea_error("op=%lx hret=%lx " 70 ehea_error("opcode=%lx ret=%lx"
74 "i1=%lx i2=%lx i3=%lx i4=%lx i5=%lx i6=%lx " 71 " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
75 "i7=%lx i8=%lx i9=%lx " 72 " arg5=%lx arg6=%lx arg7=%lx ",
76 "o1=%lx o2=%lx o3=%lx o4=%lx o5=%lx o6=%lx " 73 opcode, ret,
77 "o7=%lx o8=%lx o9=%lx", 74 arg1, arg2, arg3, arg4, arg5,
78 opcode, hret, arg1, arg2, arg3, arg4, arg5, 75 arg6, arg7);
79 arg6, arg7, arg8, arg9, *out1, *out2, *out3, 76
80 *out4, *out5, *out6, *out7, *out8, *out9); 77 return ret;
81 return hret;
82 } 78 }
79
83 return H_BUSY; 80 return H_BUSY;
84} 81}
85 82
86u64 ehea_h_query_ehea_qp(const u64 adapter_handle, const u8 qp_category, 83static long ehea_plpar_hcall9(unsigned long opcode,
87 const u64 qp_handle, const u64 sel_mask, void *cb_addr) 84 unsigned long *outs, /* array of 9 outputs */
85 unsigned long arg1,
86 unsigned long arg2,
87 unsigned long arg3,
88 unsigned long arg4,
89 unsigned long arg5,
90 unsigned long arg6,
91 unsigned long arg7,
92 unsigned long arg8,
93 unsigned long arg9)
88{ 94{
89 u64 dummy; 95 long ret;
96 int i, sleep_msecs;
90 97
91 if ((((u64)cb_addr) & (PAGE_SIZE - 1)) != 0) { 98 for (i = 0; i < 5; i++) {
92 ehea_error("not on pageboundary"); 99 ret = plpar_hcall9(opcode, outs,
93 return H_PARAMETER; 100 arg1, arg2, arg3, arg4, arg5,
101 arg6, arg7, arg8, arg9);
102
103 if (H_IS_LONG_BUSY(ret)) {
104 sleep_msecs = get_longbusy_msecs(ret);
105 msleep_interruptible(sleep_msecs);
106 continue;
107 }
108
109 if (ret < H_SUCCESS)
110 ehea_error("opcode=%lx ret=%lx"
111 " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
112 " arg5=%lx arg6=%lx arg7=%lx arg8=%lx"
113 " arg9=%lx"
114 " out1=%lx out2=%lx out3=%lx out4=%lx"
115 " out5=%lx out6=%lx out7=%lx out8=%lx"
116 " out9=%lx",
117 opcode, ret,
118 arg1, arg2, arg3, arg4, arg5,
119 arg6, arg7, arg8, arg9,
120 outs[0], outs[1], outs[2], outs[3],
121 outs[4], outs[5], outs[6], outs[7],
122 outs[8]);
123
124 return ret;
94 } 125 }
95 126
96 return ehea_hcall_9arg_9ret(H_QUERY_HEA_QP, 127 return H_BUSY;
97 adapter_handle, /* R4 */ 128}
98 qp_category, /* R5 */ 129
99 qp_handle, /* R6 */ 130u64 ehea_h_query_ehea_qp(const u64 adapter_handle, const u8 qp_category,
100 sel_mask, /* R7 */ 131 const u64 qp_handle, const u64 sel_mask, void *cb_addr)
101 virt_to_abs(cb_addr), /* R8 */ 132{
102 0, 0, 0, 0, /* R9-R12 */ 133 return ehea_plpar_hcall_norets(H_QUERY_HEA_QP,
103 &dummy, /* R4 */ 134 adapter_handle, /* R4 */
104 &dummy, /* R5 */ 135 qp_category, /* R5 */
105 &dummy, /* R6 */ 136 qp_handle, /* R6 */
106 &dummy, /* R7 */ 137 sel_mask, /* R7 */
107 &dummy, /* R8 */ 138 virt_to_abs(cb_addr), /* R8 */
108 &dummy, /* R9 */ 139 0, 0);
109 &dummy, /* R10 */
110 &dummy, /* R11 */
111 &dummy); /* R12 */
112} 140}
113 141
114/* input param R5 */ 142/* input param R5 */
@@ -180,6 +208,7 @@ u64 ehea_h_alloc_resource_qp(const u64 adapter_handle,
180 u64 *qp_handle, struct h_epas *h_epas) 208 u64 *qp_handle, struct h_epas *h_epas)
181{ 209{
182 u64 hret; 210 u64 hret;
211 u64 outs[PLPAR_HCALL9_BUFSIZE];
183 212
184 u64 allocate_controls = 213 u64 allocate_controls =
185 EHEA_BMASK_SET(H_ALL_RES_QP_EQPO, init_attr->low_lat_rq1 ? 1 : 0) 214 EHEA_BMASK_SET(H_ALL_RES_QP_EQPO, init_attr->low_lat_rq1 ? 1 : 0)
@@ -219,45 +248,29 @@ u64 ehea_h_alloc_resource_qp(const u64 adapter_handle,
219 EHEA_BMASK_SET(H_ALL_RES_QP_TH_RQ2, init_attr->rq2_threshold) 248 EHEA_BMASK_SET(H_ALL_RES_QP_TH_RQ2, init_attr->rq2_threshold)
220 | EHEA_BMASK_SET(H_ALL_RES_QP_TH_RQ3, init_attr->rq3_threshold); 249 | EHEA_BMASK_SET(H_ALL_RES_QP_TH_RQ3, init_attr->rq3_threshold);
221 250
222 u64 r5_out = 0; 251 hret = ehea_plpar_hcall9(H_ALLOC_HEA_RESOURCE,
223 u64 r6_out = 0; 252 outs,
224 u64 r7_out = 0; 253 adapter_handle, /* R4 */
225 u64 r8_out = 0; 254 allocate_controls, /* R5 */
226 u64 r9_out = 0; 255 init_attr->send_cq_handle, /* R6 */
227 u64 g_la_user_out = 0; 256 init_attr->recv_cq_handle, /* R7 */
228 u64 r11_out = 0; 257 init_attr->aff_eq_handle, /* R8 */
229 u64 r12_out = 0; 258 r9_reg, /* R9 */
230 259 max_r10_reg, /* R10 */
231 hret = ehea_hcall_9arg_9ret(H_ALLOC_HEA_RESOURCE, 260 r11_in, /* R11 */
232 adapter_handle, /* R4 */ 261 threshold); /* R12 */
233 allocate_controls, /* R5 */ 262
234 init_attr->send_cq_handle, /* R6 */ 263 *qp_handle = outs[0];
235 init_attr->recv_cq_handle, /* R7 */ 264 init_attr->qp_nr = (u32)outs[1];
236 init_attr->aff_eq_handle, /* R8 */
237 r9_reg, /* R9 */
238 max_r10_reg, /* R10 */
239 r11_in, /* R11 */
240 threshold, /* R12 */
241 qp_handle, /* R4 */
242 &r5_out, /* R5 */
243 &r6_out, /* R6 */
244 &r7_out, /* R7 */
245 &r8_out, /* R8 */
246 &r9_out, /* R9 */
247 &g_la_user_out, /* R10 */
248 &r11_out, /* R11 */
249 &r12_out); /* R12 */
250
251 init_attr->qp_nr = (u32)r5_out;
252 265
253 init_attr->act_nr_send_wqes = 266 init_attr->act_nr_send_wqes =
254 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_SWQE, r6_out); 267 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_SWQE, outs[2]);
255 init_attr->act_nr_rwqes_rq1 = 268 init_attr->act_nr_rwqes_rq1 =
256 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R1WQE, r6_out); 269 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R1WQE, outs[2]);
257 init_attr->act_nr_rwqes_rq2 = 270 init_attr->act_nr_rwqes_rq2 =
258 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R2WQE, r6_out); 271 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R2WQE, outs[2]);
259 init_attr->act_nr_rwqes_rq3 = 272 init_attr->act_nr_rwqes_rq3 =
260 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R3WQE, r6_out); 273 (u16)EHEA_BMASK_GET(H_ALL_RES_QP_ACT_R3WQE, outs[2]);
261 274
262 init_attr->act_wqe_size_enc_sq = init_attr->wqe_size_enc_sq; 275 init_attr->act_wqe_size_enc_sq = init_attr->wqe_size_enc_sq;
263 init_attr->act_wqe_size_enc_rq1 = init_attr->wqe_size_enc_rq1; 276 init_attr->act_wqe_size_enc_rq1 = init_attr->wqe_size_enc_rq1;
@@ -265,25 +278,25 @@ u64 ehea_h_alloc_resource_qp(const u64 adapter_handle,
265 init_attr->act_wqe_size_enc_rq3 = init_attr->wqe_size_enc_rq3; 278 init_attr->act_wqe_size_enc_rq3 = init_attr->wqe_size_enc_rq3;
266 279
267 init_attr->nr_sq_pages = 280 init_attr->nr_sq_pages =
268 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_SQ, r8_out); 281 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_SQ, outs[4]);
269 init_attr->nr_rq1_pages = 282 init_attr->nr_rq1_pages =
270 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ1, r8_out); 283 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ1, outs[4]);
271 init_attr->nr_rq2_pages = 284 init_attr->nr_rq2_pages =
272 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ2, r9_out); 285 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ2, outs[5]);
273 init_attr->nr_rq3_pages = 286 init_attr->nr_rq3_pages =
274 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ3, r9_out); 287 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_SIZE_RQ3, outs[5]);
275 288
276 init_attr->liobn_sq = 289 init_attr->liobn_sq =
277 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_SQ, r11_out); 290 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_SQ, outs[7]);
278 init_attr->liobn_rq1 = 291 init_attr->liobn_rq1 =
279 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ1, r11_out); 292 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ1, outs[7]);
280 init_attr->liobn_rq2 = 293 init_attr->liobn_rq2 =
281 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ2, r12_out); 294 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ2, outs[8]);
282 init_attr->liobn_rq3 = 295 init_attr->liobn_rq3 =
283 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ3, r12_out); 296 (u32)EHEA_BMASK_GET(H_ALL_RES_QP_LIOBN_RQ3, outs[8]);
284 297
285 if (!hret) 298 if (!hret)
286 hcp_epas_ctor(h_epas, g_la_user_out, g_la_user_out); 299 hcp_epas_ctor(h_epas, outs[6], outs[6]);
287 300
288 return hret; 301 return hret;
289} 302}
@@ -292,31 +305,24 @@ u64 ehea_h_alloc_resource_cq(const u64 adapter_handle,
292 struct ehea_cq_attr *cq_attr, 305 struct ehea_cq_attr *cq_attr,
293 u64 *cq_handle, struct h_epas *epas) 306 u64 *cq_handle, struct h_epas *epas)
294{ 307{
295 u64 hret, dummy, act_nr_of_cqes_out, act_pages_out; 308 u64 hret;
296 u64 g_la_privileged_out, g_la_user_out; 309 u64 outs[PLPAR_HCALL9_BUFSIZE];
297 310
298 hret = ehea_hcall_9arg_9ret(H_ALLOC_HEA_RESOURCE, 311 hret = ehea_plpar_hcall9(H_ALLOC_HEA_RESOURCE,
299 adapter_handle, /* R4 */ 312 outs,
300 H_ALL_RES_TYPE_CQ, /* R5 */ 313 adapter_handle, /* R4 */
301 cq_attr->eq_handle, /* R6 */ 314 H_ALL_RES_TYPE_CQ, /* R5 */
302 cq_attr->cq_token, /* R7 */ 315 cq_attr->eq_handle, /* R6 */
303 cq_attr->max_nr_of_cqes, /* R8 */ 316 cq_attr->cq_token, /* R7 */
304 0, 0, 0, 0, /* R9-R12 */ 317 cq_attr->max_nr_of_cqes, /* R8 */
305 cq_handle, /* R4 */ 318 0, 0, 0, 0); /* R9-R12 */
306 &dummy, /* R5 */ 319
307 &dummy, /* R6 */ 320 *cq_handle = outs[0];
308 &act_nr_of_cqes_out, /* R7 */ 321 cq_attr->act_nr_of_cqes = outs[3];
309 &act_pages_out, /* R8 */ 322 cq_attr->nr_pages = outs[4];
310 &g_la_privileged_out, /* R9 */
311 &g_la_user_out, /* R10 */
312 &dummy, /* R11 */
313 &dummy); /* R12 */
314
315 cq_attr->act_nr_of_cqes = act_nr_of_cqes_out;
316 cq_attr->nr_pages = act_pages_out;
317 323
318 if (!hret) 324 if (!hret)
319 hcp_epas_ctor(epas, g_la_privileged_out, g_la_user_out); 325 hcp_epas_ctor(epas, outs[5], outs[6]);
320 326
321 return hret; 327 return hret;
322} 328}
@@ -361,9 +367,8 @@ u64 ehea_h_alloc_resource_cq(const u64 adapter_handle,
361u64 ehea_h_alloc_resource_eq(const u64 adapter_handle, 367u64 ehea_h_alloc_resource_eq(const u64 adapter_handle,
362 struct ehea_eq_attr *eq_attr, u64 *eq_handle) 368 struct ehea_eq_attr *eq_attr, u64 *eq_handle)
363{ 369{
364 u64 hret, dummy, eq_liobn, allocate_controls; 370 u64 hret, allocate_controls;
365 u64 ist1_out, ist2_out, ist3_out, ist4_out; 371 u64 outs[PLPAR_HCALL9_BUFSIZE];
366 u64 act_nr_of_eqes_out, act_pages_out;
367 372
368 /* resource type */ 373 /* resource type */
369 allocate_controls = 374 allocate_controls =
@@ -372,27 +377,20 @@ u64 ehea_h_alloc_resource_eq(const u64 adapter_handle,
372 | EHEA_BMASK_SET(H_ALL_RES_EQ_INH_EQE_GEN, !eq_attr->eqe_gen) 377 | EHEA_BMASK_SET(H_ALL_RES_EQ_INH_EQE_GEN, !eq_attr->eqe_gen)
373 | EHEA_BMASK_SET(H_ALL_RES_EQ_NON_NEQ_ISN, 1); 378 | EHEA_BMASK_SET(H_ALL_RES_EQ_NON_NEQ_ISN, 1);
374 379
375 hret = ehea_hcall_9arg_9ret(H_ALLOC_HEA_RESOURCE, 380 hret = ehea_plpar_hcall9(H_ALLOC_HEA_RESOURCE,
376 adapter_handle, /* R4 */ 381 outs,
377 allocate_controls, /* R5 */ 382 adapter_handle, /* R4 */
378 eq_attr->max_nr_of_eqes, /* R6 */ 383 allocate_controls, /* R5 */
379 0, 0, 0, 0, 0, 0, /* R7-R10 */ 384 eq_attr->max_nr_of_eqes, /* R6 */
380 eq_handle, /* R4 */ 385 0, 0, 0, 0, 0, 0); /* R7-R10 */
381 &dummy, /* R5 */ 386
382 &eq_liobn, /* R6 */ 387 *eq_handle = outs[0];
383 &act_nr_of_eqes_out, /* R7 */ 388 eq_attr->act_nr_of_eqes = outs[3];
384 &act_pages_out, /* R8 */ 389 eq_attr->nr_pages = outs[4];
385 &ist1_out, /* R9 */ 390 eq_attr->ist1 = outs[5];
386 &ist2_out, /* R10 */ 391 eq_attr->ist2 = outs[6];
387 &ist3_out, /* R11 */ 392 eq_attr->ist3 = outs[7];
388 &ist4_out); /* R12 */ 393 eq_attr->ist4 = outs[8];
389
390 eq_attr->act_nr_of_eqes = act_nr_of_eqes_out;
391 eq_attr->nr_pages = act_pages_out;
392 eq_attr->ist1 = ist1_out;
393 eq_attr->ist2 = ist2_out;
394 eq_attr->ist3 = ist3_out;
395 eq_attr->ist4 = ist4_out;
396 394
397 return hret; 395 return hret;
398} 396}
@@ -402,31 +400,22 @@ u64 ehea_h_modify_ehea_qp(const u64 adapter_handle, const u8 cat,
402 void *cb_addr, u64 *inv_attr_id, u64 *proc_mask, 400 void *cb_addr, u64 *inv_attr_id, u64 *proc_mask,
403 u16 *out_swr, u16 *out_rwr) 401 u16 *out_swr, u16 *out_rwr)
404{ 402{
405 u64 hret, dummy, act_out_swr, act_out_rwr; 403 u64 hret;
406 404 u64 outs[PLPAR_HCALL9_BUFSIZE];
407 if ((((u64)cb_addr) & (PAGE_SIZE - 1)) != 0) { 405
408 ehea_error("not on page boundary"); 406 hret = ehea_plpar_hcall9(H_MODIFY_HEA_QP,
409 return H_PARAMETER; 407 outs,
410 } 408 adapter_handle, /* R4 */
411 409 (u64) cat, /* R5 */
412 hret = ehea_hcall_9arg_9ret(H_MODIFY_HEA_QP, 410 qp_handle, /* R6 */
413 adapter_handle, /* R4 */ 411 sel_mask, /* R7 */
414 (u64) cat, /* R5 */ 412 virt_to_abs(cb_addr), /* R8 */
415 qp_handle, /* R6 */ 413 0, 0, 0, 0); /* R9-R12 */
416 sel_mask, /* R7 */ 414
417 virt_to_abs(cb_addr), /* R8 */ 415 *inv_attr_id = outs[0];
418 0, 0, 0, 0, /* R9-R12 */ 416 *out_swr = outs[3];
419 inv_attr_id, /* R4 */ 417 *out_rwr = outs[4];
420 &dummy, /* R5 */ 418 *proc_mask = outs[5];
421 &dummy, /* R6 */
422 &act_out_swr, /* R7 */
423 &act_out_rwr, /* R8 */
424 proc_mask, /* R9 */
425 &dummy, /* R10 */
426 &dummy, /* R11 */
427 &dummy); /* R12 */
428 *out_swr = act_out_swr;
429 *out_rwr = act_out_rwr;
430 419
431 return hret; 420 return hret;
432} 421}
@@ -435,122 +424,81 @@ u64 ehea_h_register_rpage(const u64 adapter_handle, const u8 pagesize,
435 const u8 queue_type, const u64 resource_handle, 424 const u8 queue_type, const u64 resource_handle,
436 const u64 log_pageaddr, u64 count) 425 const u64 log_pageaddr, u64 count)
437{ 426{
438 u64 dummy, reg_control; 427 u64 reg_control;
439 428
440 reg_control = EHEA_BMASK_SET(H_REG_RPAGE_PAGE_SIZE, pagesize) 429 reg_control = EHEA_BMASK_SET(H_REG_RPAGE_PAGE_SIZE, pagesize)
441 | EHEA_BMASK_SET(H_REG_RPAGE_QT, queue_type); 430 | EHEA_BMASK_SET(H_REG_RPAGE_QT, queue_type);
442 431
443 return ehea_hcall_9arg_9ret(H_REGISTER_HEA_RPAGES, 432 return ehea_plpar_hcall_norets(H_REGISTER_HEA_RPAGES,
444 adapter_handle, /* R4 */ 433 adapter_handle, /* R4 */
445 reg_control, /* R5 */ 434 reg_control, /* R5 */
446 resource_handle, /* R6 */ 435 resource_handle, /* R6 */
447 log_pageaddr, /* R7 */ 436 log_pageaddr, /* R7 */
448 count, /* R8 */ 437 count, /* R8 */
449 0, 0, 0, 0, /* R9-R12 */ 438 0, 0); /* R9-R10 */
450 &dummy, /* R4 */
451 &dummy, /* R5 */
452 &dummy, /* R6 */
453 &dummy, /* R7 */
454 &dummy, /* R8 */
455 &dummy, /* R9 */
456 &dummy, /* R10 */
457 &dummy, /* R11 */
458 &dummy); /* R12 */
459} 439}
460 440
461u64 ehea_h_register_smr(const u64 adapter_handle, const u64 orig_mr_handle, 441u64 ehea_h_register_smr(const u64 adapter_handle, const u64 orig_mr_handle,
462 const u64 vaddr_in, const u32 access_ctrl, const u32 pd, 442 const u64 vaddr_in, const u32 access_ctrl, const u32 pd,
463 struct ehea_mr *mr) 443 struct ehea_mr *mr)
464{ 444{
465 u64 hret, dummy, lkey_out; 445 u64 hret;
466 446 u64 outs[PLPAR_HCALL9_BUFSIZE];
467 hret = ehea_hcall_9arg_9ret(H_REGISTER_SMR, 447
468 adapter_handle , /* R4 */ 448 hret = ehea_plpar_hcall9(H_REGISTER_SMR,
469 orig_mr_handle, /* R5 */ 449 outs,
470 vaddr_in, /* R6 */ 450 adapter_handle , /* R4 */
471 (((u64)access_ctrl) << 32ULL), /* R7 */ 451 orig_mr_handle, /* R5 */
472 pd, /* R8 */ 452 vaddr_in, /* R6 */
473 0, 0, 0, 0, /* R9-R12 */ 453 (((u64)access_ctrl) << 32ULL), /* R7 */
474 &mr->handle, /* R4 */ 454 pd, /* R8 */
475 &dummy, /* R5 */ 455 0, 0, 0, 0); /* R9-R12 */
476 &lkey_out, /* R6 */ 456
477 &dummy, /* R7 */ 457 mr->handle = outs[0];
478 &dummy, /* R8 */ 458 mr->lkey = (u32)outs[2];
479 &dummy, /* R9 */
480 &dummy, /* R10 */
481 &dummy, /* R11 */
482 &dummy); /* R12 */
483 mr->lkey = (u32)lkey_out;
484 459
485 return hret; 460 return hret;
486} 461}
487 462
488u64 ehea_h_disable_and_get_hea(const u64 adapter_handle, const u64 qp_handle) 463u64 ehea_h_disable_and_get_hea(const u64 adapter_handle, const u64 qp_handle)
489{ 464{
490 u64 hret, dummy, ladr_next_sq_wqe_out; 465 u64 outs[PLPAR_HCALL9_BUFSIZE];
491 u64 ladr_next_rq1_wqe_out, ladr_next_rq2_wqe_out, ladr_next_rq3_wqe_out; 466
492 467 return ehea_plpar_hcall9(H_DISABLE_AND_GET_HEA,
493 hret = ehea_hcall_9arg_9ret(H_DISABLE_AND_GET_HEA, 468 outs,
494 adapter_handle, /* R4 */ 469 adapter_handle, /* R4 */
495 H_DISABLE_GET_EHEA_WQE_P, /* R5 */ 470 H_DISABLE_GET_EHEA_WQE_P, /* R5 */
496 qp_handle, /* R6 */ 471 qp_handle, /* R6 */
497 0, 0, 0, 0, 0, 0, /* R7-R12 */ 472 0, 0, 0, 0, 0, 0); /* R7-R12 */
498 &ladr_next_sq_wqe_out, /* R4 */
499 &ladr_next_rq1_wqe_out, /* R5 */
500 &ladr_next_rq2_wqe_out, /* R6 */
501 &ladr_next_rq3_wqe_out, /* R7 */
502 &dummy, /* R8 */
503 &dummy, /* R9 */
504 &dummy, /* R10 */
505 &dummy, /* R11 */
506 &dummy); /* R12 */
507 return hret;
508} 473}
509 474
510u64 ehea_h_free_resource(const u64 adapter_handle, const u64 res_handle) 475u64 ehea_h_free_resource(const u64 adapter_handle, const u64 res_handle)
511{ 476{
512 u64 dummy; 477 return ehea_plpar_hcall_norets(H_FREE_RESOURCE,
513 478 adapter_handle, /* R4 */
514 return ehea_hcall_9arg_9ret(H_FREE_RESOURCE, 479 res_handle, /* R5 */
515 adapter_handle, /* R4 */ 480 0, 0, 0, 0, 0); /* R6-R10 */
516 res_handle, /* R5 */
517 0, 0, 0, 0, 0, 0, 0, /* R6-R12 */
518 &dummy, /* R4 */
519 &dummy, /* R5 */
520 &dummy, /* R6 */
521 &dummy, /* R7 */
522 &dummy, /* R8 */
523 &dummy, /* R9 */
524 &dummy, /* R10 */
525 &dummy, /* R11 */
526 &dummy); /* R12 */
527} 481}
528 482
529u64 ehea_h_alloc_resource_mr(const u64 adapter_handle, const u64 vaddr, 483u64 ehea_h_alloc_resource_mr(const u64 adapter_handle, const u64 vaddr,
530 const u64 length, const u32 access_ctrl, 484 const u64 length, const u32 access_ctrl,
531 const u32 pd, u64 *mr_handle, u32 *lkey) 485 const u32 pd, u64 *mr_handle, u32 *lkey)
532{ 486{
533 u64 hret, dummy, lkey_out; 487 u64 hret;
534 488 u64 outs[PLPAR_HCALL9_BUFSIZE];
535 hret = ehea_hcall_9arg_9ret(H_ALLOC_HEA_RESOURCE, 489
536 adapter_handle, /* R4 */ 490 hret = ehea_plpar_hcall9(H_ALLOC_HEA_RESOURCE,
537 5, /* R5 */ 491 outs,
538 vaddr, /* R6 */ 492 adapter_handle, /* R4 */
539 length, /* R7 */ 493 5, /* R5 */
540 (((u64) access_ctrl) << 32ULL),/* R8 */ 494 vaddr, /* R6 */
541 pd, /* R9 */ 495 length, /* R7 */
542 0, 0, 0, /* R10-R12 */ 496 (((u64) access_ctrl) << 32ULL), /* R8 */
543 mr_handle, /* R4 */ 497 pd, /* R9 */
544 &dummy, /* R5 */ 498 0, 0, 0); /* R10-R12 */
545 &lkey_out, /* R6 */ 499
546 &dummy, /* R7 */ 500 *mr_handle = outs[0];
547 &dummy, /* R8 */ 501 *lkey = (u32)outs[2];
548 &dummy, /* R9 */
549 &dummy, /* R10 */
550 &dummy, /* R11 */
551 &dummy); /* R12 */
552 *lkey = (u32) lkey_out;
553
554 return hret; 502 return hret;
555} 503}
556 504
@@ -570,23 +518,14 @@ u64 ehea_h_register_rpage_mr(const u64 adapter_handle, const u64 mr_handle,
570 518
571u64 ehea_h_query_ehea(const u64 adapter_handle, void *cb_addr) 519u64 ehea_h_query_ehea(const u64 adapter_handle, void *cb_addr)
572{ 520{
573 u64 hret, dummy, cb_logaddr; 521 u64 hret, cb_logaddr;
574 522
575 cb_logaddr = virt_to_abs(cb_addr); 523 cb_logaddr = virt_to_abs(cb_addr);
576 524
577 hret = ehea_hcall_9arg_9ret(H_QUERY_HEA, 525 hret = ehea_plpar_hcall_norets(H_QUERY_HEA,
578 adapter_handle, /* R4 */ 526 adapter_handle, /* R4 */
579 cb_logaddr, /* R5 */ 527 cb_logaddr, /* R5 */
580 0, 0, 0, 0, 0, 0, 0, /* R6-R12 */ 528 0, 0, 0, 0, 0); /* R6-R10 */
581 &dummy, /* R4 */
582 &dummy, /* R5 */
583 &dummy, /* R6 */
584 &dummy, /* R7 */
585 &dummy, /* R8 */
586 &dummy, /* R9 */
587 &dummy, /* R10 */
588 &dummy, /* R11 */
589 &dummy); /* R12 */
590#ifdef DEBUG 529#ifdef DEBUG
591 ehea_dmp(cb_addr, sizeof(struct hcp_query_ehea), "hcp_query_ehea"); 530 ehea_dmp(cb_addr, sizeof(struct hcp_query_ehea), "hcp_query_ehea");
592#endif 531#endif
@@ -597,36 +536,28 @@ u64 ehea_h_query_ehea_port(const u64 adapter_handle, const u16 port_num,
597 const u8 cb_cat, const u64 select_mask, 536 const u8 cb_cat, const u64 select_mask,
598 void *cb_addr) 537 void *cb_addr)
599{ 538{
600 u64 port_info, dummy; 539 u64 port_info;
601 u64 cb_logaddr = virt_to_abs(cb_addr); 540 u64 cb_logaddr = virt_to_abs(cb_addr);
602 u64 arr_index = 0; 541 u64 arr_index = 0;
603 542
604 port_info = EHEA_BMASK_SET(H_MEHEAPORT_CAT, cb_cat) 543 port_info = EHEA_BMASK_SET(H_MEHEAPORT_CAT, cb_cat)
605 | EHEA_BMASK_SET(H_MEHEAPORT_PN, port_num); 544 | EHEA_BMASK_SET(H_MEHEAPORT_PN, port_num);
606 545
607 return ehea_hcall_9arg_9ret(H_QUERY_HEA_PORT, 546 return ehea_plpar_hcall_norets(H_QUERY_HEA_PORT,
608 adapter_handle, /* R4 */ 547 adapter_handle, /* R4 */
609 port_info, /* R5 */ 548 port_info, /* R5 */
610 select_mask, /* R6 */ 549 select_mask, /* R6 */
611 arr_index, /* R7 */ 550 arr_index, /* R7 */
612 cb_logaddr, /* R8 */ 551 cb_logaddr, /* R8 */
613 0, 0, 0, 0, /* R9-R12 */ 552 0, 0); /* R9-R10 */
614 &dummy, /* R4 */
615 &dummy, /* R5 */
616 &dummy, /* R6 */
617 &dummy, /* R7 */
618 &dummy, /* R8 */
619 &dummy, /* R9 */
620 &dummy, /* R10 */
621 &dummy, /* R11 */
622 &dummy); /* R12 */
623} 553}
624 554
625u64 ehea_h_modify_ehea_port(const u64 adapter_handle, const u16 port_num, 555u64 ehea_h_modify_ehea_port(const u64 adapter_handle, const u16 port_num,
626 const u8 cb_cat, const u64 select_mask, 556 const u8 cb_cat, const u64 select_mask,
627 void *cb_addr) 557 void *cb_addr)
628{ 558{
629 u64 port_info, dummy, inv_attr_ident, proc_mask; 559 u64 outs[PLPAR_HCALL9_BUFSIZE];
560 u64 port_info;
630 u64 arr_index = 0; 561 u64 arr_index = 0;
631 u64 cb_logaddr = virt_to_abs(cb_addr); 562 u64 cb_logaddr = virt_to_abs(cb_addr);
632 563
@@ -635,29 +566,21 @@ u64 ehea_h_modify_ehea_port(const u64 adapter_handle, const u16 port_num,
635#ifdef DEBUG 566#ifdef DEBUG
636 ehea_dump(cb_addr, sizeof(struct hcp_ehea_port_cb0), "Before HCALL"); 567 ehea_dump(cb_addr, sizeof(struct hcp_ehea_port_cb0), "Before HCALL");
637#endif 568#endif
638 return ehea_hcall_9arg_9ret(H_MODIFY_HEA_PORT, 569 return ehea_plpar_hcall9(H_MODIFY_HEA_PORT,
639 adapter_handle, /* R4 */ 570 outs,
640 port_info, /* R5 */ 571 adapter_handle, /* R4 */
641 select_mask, /* R6 */ 572 port_info, /* R5 */
642 arr_index, /* R7 */ 573 select_mask, /* R6 */
643 cb_logaddr, /* R8 */ 574 arr_index, /* R7 */
644 0, 0, 0, 0, /* R9-R12 */ 575 cb_logaddr, /* R8 */
645 &inv_attr_ident, /* R4 */ 576 0, 0, 0, 0); /* R9-R12 */
646 &proc_mask, /* R5 */
647 &dummy, /* R6 */
648 &dummy, /* R7 */
649 &dummy, /* R8 */
650 &dummy, /* R9 */
651 &dummy, /* R10 */
652 &dummy, /* R11 */
653 &dummy); /* R12 */
654} 577}
655 578
656u64 ehea_h_reg_dereg_bcmc(const u64 adapter_handle, const u16 port_num, 579u64 ehea_h_reg_dereg_bcmc(const u64 adapter_handle, const u16 port_num,
657 const u8 reg_type, const u64 mc_mac_addr, 580 const u8 reg_type, const u64 mc_mac_addr,
658 const u16 vlan_id, const u32 hcall_id) 581 const u16 vlan_id, const u32 hcall_id)
659{ 582{
660 u64 r5_port_num, r6_reg_type, r7_mc_mac_addr, r8_vlan_id, dummy; 583 u64 r5_port_num, r6_reg_type, r7_mc_mac_addr, r8_vlan_id;
661 u64 mac_addr = mc_mac_addr >> 16; 584 u64 mac_addr = mc_mac_addr >> 16;
662 585
663 r5_port_num = EHEA_BMASK_SET(H_REGBCMC_PN, port_num); 586 r5_port_num = EHEA_BMASK_SET(H_REGBCMC_PN, port_num);
@@ -665,41 +588,21 @@ u64 ehea_h_reg_dereg_bcmc(const u64 adapter_handle, const u16 port_num,
665 r7_mc_mac_addr = EHEA_BMASK_SET(H_REGBCMC_MACADDR, mac_addr); 588 r7_mc_mac_addr = EHEA_BMASK_SET(H_REGBCMC_MACADDR, mac_addr);
666 r8_vlan_id = EHEA_BMASK_SET(H_REGBCMC_VLANID, vlan_id); 589 r8_vlan_id = EHEA_BMASK_SET(H_REGBCMC_VLANID, vlan_id);
667 590
668 return ehea_hcall_9arg_9ret(hcall_id, 591 return ehea_plpar_hcall_norets(hcall_id,
669 adapter_handle, /* R4 */ 592 adapter_handle, /* R4 */
670 r5_port_num, /* R5 */ 593 r5_port_num, /* R5 */
671 r6_reg_type, /* R6 */ 594 r6_reg_type, /* R6 */
672 r7_mc_mac_addr, /* R7 */ 595 r7_mc_mac_addr, /* R7 */
673 r8_vlan_id, /* R8 */ 596 r8_vlan_id, /* R8 */
674 0, 0, 0, 0, /* R9-R12 */ 597 0, 0); /* R9-R12 */
675 &dummy, /* R4 */
676 &dummy, /* R5 */
677 &dummy, /* R6 */
678 &dummy, /* R7 */
679 &dummy, /* R8 */
680 &dummy, /* R9 */
681 &dummy, /* R10 */
682 &dummy, /* R11 */
683 &dummy); /* R12 */
684} 598}
685 599
686u64 ehea_h_reset_events(const u64 adapter_handle, const u64 neq_handle, 600u64 ehea_h_reset_events(const u64 adapter_handle, const u64 neq_handle,
687 const u64 event_mask) 601 const u64 event_mask)
688{ 602{
689 u64 dummy; 603 return ehea_plpar_hcall_norets(H_RESET_EVENTS,
690 604 adapter_handle, /* R4 */
691 return ehea_hcall_9arg_9ret(H_RESET_EVENTS, 605 neq_handle, /* R5 */
692 adapter_handle, /* R4 */ 606 event_mask, /* R6 */
693 neq_handle, /* R5 */ 607 0, 0, 0, 0); /* R7-R12 */
694 event_mask, /* R6 */
695 0, 0, 0, 0, 0, 0, /* R7-R12 */
696 &dummy, /* R4 */
697 &dummy, /* R5 */
698 &dummy, /* R6 */
699 &dummy, /* R7 */
700 &dummy, /* R8 */
701 &dummy, /* R9 */
702 &dummy, /* R10 */
703 &dummy, /* R11 */
704 &dummy); /* R12 */
705} 608}
diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c
index 8cc3c331aca8..b7b8bc2a6307 100644
--- a/drivers/net/eth16i.c
+++ b/drivers/net/eth16i.c
@@ -162,9 +162,9 @@ static char *version =
162#include <linux/skbuff.h> 162#include <linux/skbuff.h>
163#include <linux/bitops.h> 163#include <linux/bitops.h>
164#include <linux/jiffies.h> 164#include <linux/jiffies.h>
165#include <linux/io.h>
165 166
166#include <asm/system.h> 167#include <asm/system.h>
167#include <asm/io.h>
168#include <asm/dma.h> 168#include <asm/dma.h>
169 169
170 170
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 99b7a411db28..c5ed635bce36 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -2497,6 +2497,7 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data)
2497 u8 __iomem *base = get_hwbase(dev); 2497 u8 __iomem *base = get_hwbase(dev);
2498 u32 events; 2498 u32 events;
2499 int i; 2499 int i;
2500 unsigned long flags;
2500 2501
2501 dprintk(KERN_DEBUG "%s: nv_nic_irq_tx\n", dev->name); 2502 dprintk(KERN_DEBUG "%s: nv_nic_irq_tx\n", dev->name);
2502 2503
@@ -2508,16 +2509,16 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data)
2508 if (!(events & np->irqmask)) 2509 if (!(events & np->irqmask))
2509 break; 2510 break;
2510 2511
2511 spin_lock_irq(&np->lock); 2512 spin_lock_irqsave(&np->lock, flags);
2512 nv_tx_done(dev); 2513 nv_tx_done(dev);
2513 spin_unlock_irq(&np->lock); 2514 spin_unlock_irqrestore(&np->lock, flags);
2514 2515
2515 if (events & (NVREG_IRQ_TX_ERR)) { 2516 if (events & (NVREG_IRQ_TX_ERR)) {
2516 dprintk(KERN_DEBUG "%s: received irq with events 0x%x. Probably TX fail.\n", 2517 dprintk(KERN_DEBUG "%s: received irq with events 0x%x. Probably TX fail.\n",
2517 dev->name, events); 2518 dev->name, events);
2518 } 2519 }
2519 if (i > max_interrupt_work) { 2520 if (i > max_interrupt_work) {
2520 spin_lock_irq(&np->lock); 2521 spin_lock_irqsave(&np->lock, flags);
2521 /* disable interrupts on the nic */ 2522 /* disable interrupts on the nic */
2522 writel(NVREG_IRQ_TX_ALL, base + NvRegIrqMask); 2523 writel(NVREG_IRQ_TX_ALL, base + NvRegIrqMask);
2523 pci_push(base); 2524 pci_push(base);
@@ -2527,7 +2528,7 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data)
2527 mod_timer(&np->nic_poll, jiffies + POLL_WAIT); 2528 mod_timer(&np->nic_poll, jiffies + POLL_WAIT);
2528 } 2529 }
2529 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_tx.\n", dev->name, i); 2530 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_tx.\n", dev->name, i);
2530 spin_unlock_irq(&np->lock); 2531 spin_unlock_irqrestore(&np->lock, flags);
2531 break; 2532 break;
2532 } 2533 }
2533 2534
@@ -2601,6 +2602,7 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
2601 u8 __iomem *base = get_hwbase(dev); 2602 u8 __iomem *base = get_hwbase(dev);
2602 u32 events; 2603 u32 events;
2603 int i; 2604 int i;
2605 unsigned long flags;
2604 2606
2605 dprintk(KERN_DEBUG "%s: nv_nic_irq_rx\n", dev->name); 2607 dprintk(KERN_DEBUG "%s: nv_nic_irq_rx\n", dev->name);
2606 2608
@@ -2614,14 +2616,14 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
2614 2616
2615 nv_rx_process(dev, dev->weight); 2617 nv_rx_process(dev, dev->weight);
2616 if (nv_alloc_rx(dev)) { 2618 if (nv_alloc_rx(dev)) {
2617 spin_lock_irq(&np->lock); 2619 spin_lock_irqsave(&np->lock, flags);
2618 if (!np->in_shutdown) 2620 if (!np->in_shutdown)
2619 mod_timer(&np->oom_kick, jiffies + OOM_REFILL); 2621 mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
2620 spin_unlock_irq(&np->lock); 2622 spin_unlock_irqrestore(&np->lock, flags);
2621 } 2623 }
2622 2624
2623 if (i > max_interrupt_work) { 2625 if (i > max_interrupt_work) {
2624 spin_lock_irq(&np->lock); 2626 spin_lock_irqsave(&np->lock, flags);
2625 /* disable interrupts on the nic */ 2627 /* disable interrupts on the nic */
2626 writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask); 2628 writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask);
2627 pci_push(base); 2629 pci_push(base);
@@ -2631,7 +2633,7 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
2631 mod_timer(&np->nic_poll, jiffies + POLL_WAIT); 2633 mod_timer(&np->nic_poll, jiffies + POLL_WAIT);
2632 } 2634 }
2633 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_rx.\n", dev->name, i); 2635 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_rx.\n", dev->name, i);
2634 spin_unlock_irq(&np->lock); 2636 spin_unlock_irqrestore(&np->lock, flags);
2635 break; 2637 break;
2636 } 2638 }
2637 } 2639 }
@@ -2648,6 +2650,7 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data)
2648 u8 __iomem *base = get_hwbase(dev); 2650 u8 __iomem *base = get_hwbase(dev);
2649 u32 events; 2651 u32 events;
2650 int i; 2652 int i;
2653 unsigned long flags;
2651 2654
2652 dprintk(KERN_DEBUG "%s: nv_nic_irq_other\n", dev->name); 2655 dprintk(KERN_DEBUG "%s: nv_nic_irq_other\n", dev->name);
2653 2656
@@ -2660,14 +2663,14 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data)
2660 break; 2663 break;
2661 2664
2662 if (events & NVREG_IRQ_LINK) { 2665 if (events & NVREG_IRQ_LINK) {
2663 spin_lock_irq(&np->lock); 2666 spin_lock_irqsave(&np->lock, flags);
2664 nv_link_irq(dev); 2667 nv_link_irq(dev);
2665 spin_unlock_irq(&np->lock); 2668 spin_unlock_irqrestore(&np->lock, flags);
2666 } 2669 }
2667 if (np->need_linktimer && time_after(jiffies, np->link_timeout)) { 2670 if (np->need_linktimer && time_after(jiffies, np->link_timeout)) {
2668 spin_lock_irq(&np->lock); 2671 spin_lock_irqsave(&np->lock, flags);
2669 nv_linkchange(dev); 2672 nv_linkchange(dev);
2670 spin_unlock_irq(&np->lock); 2673 spin_unlock_irqrestore(&np->lock, flags);
2671 np->link_timeout = jiffies + LINK_TIMEOUT; 2674 np->link_timeout = jiffies + LINK_TIMEOUT;
2672 } 2675 }
2673 if (events & (NVREG_IRQ_UNKNOWN)) { 2676 if (events & (NVREG_IRQ_UNKNOWN)) {
@@ -2675,7 +2678,7 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data)
2675 dev->name, events); 2678 dev->name, events);
2676 } 2679 }
2677 if (i > max_interrupt_work) { 2680 if (i > max_interrupt_work) {
2678 spin_lock_irq(&np->lock); 2681 spin_lock_irqsave(&np->lock, flags);
2679 /* disable interrupts on the nic */ 2682 /* disable interrupts on the nic */
2680 writel(NVREG_IRQ_OTHER, base + NvRegIrqMask); 2683 writel(NVREG_IRQ_OTHER, base + NvRegIrqMask);
2681 pci_push(base); 2684 pci_push(base);
@@ -2685,7 +2688,7 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data)
2685 mod_timer(&np->nic_poll, jiffies + POLL_WAIT); 2688 mod_timer(&np->nic_poll, jiffies + POLL_WAIT);
2686 } 2689 }
2687 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_other.\n", dev->name, i); 2690 printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_other.\n", dev->name, i);
2688 spin_unlock_irq(&np->lock); 2691 spin_unlock_irqrestore(&np->lock, flags);
2689 break; 2692 break;
2690 } 2693 }
2691 2694
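
Note: the forcedeth.c hunks switch the per-vector tx/rx/other interrupt handlers from spin_lock_irq()/spin_unlock_irq() to the irqsave/irqrestore variants, so the handlers no longer force interrupts back on when they may have been entered with interrupts already disabled. A toy user-space model of the difference, assuming a single boolean stands in for the CPU interrupt flag:

/* Toy model: "irq_enabled" stands in for the CPU interrupt flag; the
 * spinlock itself is omitted since only the flag handling differs. */
#include <stdio.h>
#include <stdbool.h>

static bool irq_enabled;

static void lock_irq(void)            { irq_enabled = false; }
static void unlock_irq(void)          { irq_enabled = true; }  /* always re-enables */
static void lock_irqsave(bool *f)     { *f = irq_enabled; irq_enabled = false; }
static void unlock_irqrestore(bool f) { irq_enabled = f; }      /* restores caller state */

int main(void)
{
	bool flags;

	irq_enabled = false;		/* caller already has interrupts off */
	lock_irq();
	unlock_irq();
	printf("after _irq pair:     %d (wrongly re-enabled)\n", irq_enabled);

	irq_enabled = false;
	lock_irqsave(&flags);
	unlock_irqrestore(flags);
	printf("after irqsave pair:  %d (still off, as the caller expects)\n", irq_enabled);
	return 0;
}
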
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 4bac3cd8f235..2802db23d3cb 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -213,6 +213,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
213 } 213 }
214 214
215 free_index = pool->consumer_index++ % pool->size; 215 free_index = pool->consumer_index++ % pool->size;
216 pool->consumer_index = free_index;
216 index = pool->free_map[free_index]; 217 index = pool->free_map[free_index];
217 218
218 ibmveth_assert(index != IBM_VETH_INVALID_MAP); 219 ibmveth_assert(index != IBM_VETH_INVALID_MAP);
@@ -238,7 +239,10 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
238 if(lpar_rc != H_SUCCESS) { 239 if(lpar_rc != H_SUCCESS) {
239 pool->free_map[free_index] = index; 240 pool->free_map[free_index] = index;
240 pool->skbuff[index] = NULL; 241 pool->skbuff[index] = NULL;
241 pool->consumer_index--; 242 if (pool->consumer_index == 0)
243 pool->consumer_index = pool->size - 1;
244 else
245 pool->consumer_index--;
242 dma_unmap_single(&adapter->vdev->dev, 246 dma_unmap_single(&adapter->vdev->dev,
243 pool->dma_addr[index], pool->buff_size, 247 pool->dma_addr[index], pool->buff_size,
244 DMA_FROM_DEVICE); 248 DMA_FROM_DEVICE);
@@ -326,6 +330,7 @@ static void ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, u64
326 DMA_FROM_DEVICE); 330 DMA_FROM_DEVICE);
327 331
328 free_index = adapter->rx_buff_pool[pool].producer_index++ % adapter->rx_buff_pool[pool].size; 332 free_index = adapter->rx_buff_pool[pool].producer_index++ % adapter->rx_buff_pool[pool].size;
333 adapter->rx_buff_pool[pool].producer_index = free_index;
329 adapter->rx_buff_pool[pool].free_map[free_index] = index; 334 adapter->rx_buff_pool[pool].free_map[free_index] = index;
330 335
331 mb(); 336 mb();
@@ -437,6 +442,31 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
437 &adapter->rx_buff_pool[i]); 442 &adapter->rx_buff_pool[i]);
438} 443}
439 444
445static int ibmveth_register_logical_lan(struct ibmveth_adapter *adapter,
446 union ibmveth_buf_desc rxq_desc, u64 mac_address)
447{
448 int rc, try_again = 1;
449
450 /* After a kexec the adapter will still be open, so our attempt to
451 * open it will fail. So if we get a failure we free the adapter and
452 * try again, but only once. */
453retry:
454 rc = h_register_logical_lan(adapter->vdev->unit_address,
455 adapter->buffer_list_dma, rxq_desc.desc,
456 adapter->filter_list_dma, mac_address);
457
458 if (rc != H_SUCCESS && try_again) {
459 do {
460 rc = h_free_logical_lan(adapter->vdev->unit_address);
461 } while (H_IS_LONG_BUSY(rc) || (rc == H_BUSY));
462
463 try_again = 0;
464 goto retry;
465 }
466
467 return rc;
468}
469
440static int ibmveth_open(struct net_device *netdev) 470static int ibmveth_open(struct net_device *netdev)
441{ 471{
442 struct ibmveth_adapter *adapter = netdev->priv; 472 struct ibmveth_adapter *adapter = netdev->priv;
@@ -502,12 +532,9 @@ static int ibmveth_open(struct net_device *netdev)
502 ibmveth_debug_printk("filter list @ 0x%p\n", adapter->filter_list_addr); 532 ibmveth_debug_printk("filter list @ 0x%p\n", adapter->filter_list_addr);
503 ibmveth_debug_printk("receive q @ 0x%p\n", adapter->rx_queue.queue_addr); 533 ibmveth_debug_printk("receive q @ 0x%p\n", adapter->rx_queue.queue_addr);
504 534
535 h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE);
505 536
506 lpar_rc = h_register_logical_lan(adapter->vdev->unit_address, 537 lpar_rc = ibmveth_register_logical_lan(adapter, rxq_desc, mac_address);
507 adapter->buffer_list_dma,
508 rxq_desc.desc,
509 adapter->filter_list_dma,
510 mac_address);
511 538
512 if(lpar_rc != H_SUCCESS) { 539 if(lpar_rc != H_SUCCESS) {
513 ibmveth_error_printk("h_register_logical_lan failed with %ld\n", lpar_rc); 540 ibmveth_error_printk("h_register_logical_lan failed with %ld\n", lpar_rc);
@@ -905,6 +932,14 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
905 return -EINVAL; 932 return -EINVAL;
906} 933}
907 934
935#ifdef CONFIG_NET_POLL_CONTROLLER
936static void ibmveth_poll_controller(struct net_device *dev)
937{
938 ibmveth_replenish_task(dev->priv);
939 ibmveth_interrupt(dev->irq, dev);
940}
941#endif
942
908static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id) 943static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
909{ 944{
910 int rc, i; 945 int rc, i;
@@ -977,6 +1012,9 @@ static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_
977 netdev->ethtool_ops = &netdev_ethtool_ops; 1012 netdev->ethtool_ops = &netdev_ethtool_ops;
978 netdev->change_mtu = ibmveth_change_mtu; 1013 netdev->change_mtu = ibmveth_change_mtu;
979 SET_NETDEV_DEV(netdev, &dev->dev); 1014 SET_NETDEV_DEV(netdev, &dev->dev);
1015#ifdef CONFIG_NET_POLL_CONTROLLER
1016 netdev->poll_controller = ibmveth_poll_controller;
1017#endif
980 netdev->features |= NETIF_F_LLTX; 1018 netdev->features |= NETIF_F_LLTX;
981 spin_lock_init(&adapter->stats_lock); 1019 spin_lock_init(&adapter->stats_lock);
982 1020
@@ -1132,7 +1170,9 @@ static void ibmveth_proc_register_adapter(struct ibmveth_adapter *adapter)
1132{ 1170{
1133 struct proc_dir_entry *entry; 1171 struct proc_dir_entry *entry;
1134 if (ibmveth_proc_dir) { 1172 if (ibmveth_proc_dir) {
1135 entry = create_proc_entry(adapter->netdev->name, S_IFREG, ibmveth_proc_dir); 1173 char u_addr[10];
1174 sprintf(u_addr, "%x", adapter->vdev->unit_address);
1175 entry = create_proc_entry(u_addr, S_IFREG, ibmveth_proc_dir);
1136 if (!entry) { 1176 if (!entry) {
1137 ibmveth_error_printk("Cannot create adapter proc entry"); 1177 ibmveth_error_printk("Cannot create adapter proc entry");
1138 } else { 1178 } else {
@@ -1147,7 +1187,9 @@ static void ibmveth_proc_register_adapter(struct ibmveth_adapter *adapter)
1147static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter) 1187static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter)
1148{ 1188{
1149 if (ibmveth_proc_dir) { 1189 if (ibmveth_proc_dir) {
1150 remove_proc_entry(adapter->netdev->name, ibmveth_proc_dir); 1190 char u_addr[10];
1191 sprintf(u_addr, "%x", adapter->vdev->unit_address);
1192 remove_proc_entry(u_addr, ibmveth_proc_dir);
1151 } 1193 }
1152} 1194}
1153 1195
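
Note: besides keeping consumer_index/producer_index inside the pool size and wrapping correctly when an increment has to be undone on a failed hcall, the ibmveth.c hunks add ibmveth_register_logical_lan(), which retries registration once after freeing a stale adapter instance left behind by a kexec. A small sketch of that retry-once pattern; register_lan(), free_lan() and the H_* codes here are stand-ins for the real hypervisor calls.

/* Retry-once sketch: register_lan() fails the first time (stale adapter
 * after kexec), the helper frees it and tries exactly once more. */
#include <stdio.h>

#define H_SUCCESS 0
#define H_BUSY    1

static int attempts;

static int register_lan(void) { return attempts++ ? H_SUCCESS : H_BUSY; }
static int free_lan(void)     { return H_SUCCESS; }

static int register_with_retry(void)
{
	int rc, try_again = 1;
retry:
	rc = register_lan();
	if (rc != H_SUCCESS && try_again) {
		while (free_lan() == H_BUSY)
			;		/* keep freeing until the old instance is gone */
		try_again = 0;
		goto retry;
	}
	return rc;
}

int main(void)
{
	printf("rc = %d\n", register_with_retry());	/* prints rc = 0 */
	return 0;
}
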
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 2ffa3a59e704..9997081c6dae 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -2155,7 +2155,7 @@ static void eth_update_mib_counters(struct mv643xx_private *mp)
2155 for (offset = ETH_MIB_BAD_OCTETS_RECEIVED; 2155 for (offset = ETH_MIB_BAD_OCTETS_RECEIVED;
2156 offset <= ETH_MIB_FRAMES_1024_TO_MAX_OCTETS; 2156 offset <= ETH_MIB_FRAMES_1024_TO_MAX_OCTETS;
2157 offset += 4) 2157 offset += 4)
2158 *(u32 *)((char *)p + offset) = read_mib(mp, offset); 2158 *(u32 *)((char *)p + offset) += read_mib(mp, offset);
2159 2159
2160 p->good_octets_sent += read_mib(mp, ETH_MIB_GOOD_OCTETS_SENT_LOW); 2160 p->good_octets_sent += read_mib(mp, ETH_MIB_GOOD_OCTETS_SENT_LOW);
2161 p->good_octets_sent += 2161 p->good_octets_sent +=
@@ -2164,7 +2164,7 @@ static void eth_update_mib_counters(struct mv643xx_private *mp)
2164 for (offset = ETH_MIB_GOOD_FRAMES_SENT; 2164 for (offset = ETH_MIB_GOOD_FRAMES_SENT;
2165 offset <= ETH_MIB_LATE_COLLISION; 2165 offset <= ETH_MIB_LATE_COLLISION;
2166 offset += 4) 2166 offset += 4)
2167 *(u32 *)((char *)p + offset) = read_mib(mp, offset); 2167 *(u32 *)((char *)p + offset) += read_mib(mp, offset);
2168} 2168}
2169 2169
2170/* 2170/*
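
Note: the mv643xx_eth.c change turns two "=" assignments into "+=" so that eth_update_mib_counters() accumulates the hardware MIB registers into the software statistics instead of overwriting them; the registers appear to be read-and-cleared, so each poll only returns the delta since the previous one. An illustrative model with a stub read_mib():

/* Stub read_mib(): returns the delta since the previous read and clears
 * the pretend hardware register, so the software copy must accumulate. */
#include <stdio.h>
#include <stdint.h>

static uint32_t hw_counter = 7;		/* pretend frames arrived */

static uint32_t read_mib(void)
{
	uint32_t val = hw_counter;
	hw_counter = 0;			/* register clears on read */
	return val;
}

int main(void)
{
	uint32_t good_frames = 0;

	good_frames += read_mib();	/* first poll picks up 7 */
	hw_counter = 3;			/* more traffic arrives */
	good_frames += read_mib();	/* running total becomes 10 */

	printf("good_frames = %u\n", good_frames);
	return 0;
}
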
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index a4a58e4e93a1..e7e414928f89 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -43,7 +43,7 @@
43#include "skge.h" 43#include "skge.h"
44 44
45#define DRV_NAME "skge" 45#define DRV_NAME "skge"
46#define DRV_VERSION "1.8" 46#define DRV_VERSION "1.9"
47#define PFX DRV_NAME " " 47#define PFX DRV_NAME " "
48 48
49#define DEFAULT_TX_RING_SIZE 128 49#define DEFAULT_TX_RING_SIZE 128
@@ -197,8 +197,8 @@ static u32 skge_supported_modes(const struct skge_hw *hw)
197 else if (hw->chip_id == CHIP_ID_YUKON) 197 else if (hw->chip_id == CHIP_ID_YUKON)
198 supported &= ~SUPPORTED_1000baseT_Half; 198 supported &= ~SUPPORTED_1000baseT_Half;
199 } else 199 } else
200 supported = SUPPORTED_1000baseT_Full | SUPPORTED_FIBRE 200 supported = SUPPORTED_1000baseT_Full | SUPPORTED_1000baseT_Half
201 | SUPPORTED_Autoneg; 201 | SUPPORTED_FIBRE | SUPPORTED_Autoneg;
202 202
203 return supported; 203 return supported;
204} 204}
@@ -487,31 +487,37 @@ static void skge_get_pauseparam(struct net_device *dev,
487{ 487{
488 struct skge_port *skge = netdev_priv(dev); 488 struct skge_port *skge = netdev_priv(dev);
489 489
490 ecmd->tx_pause = (skge->flow_control == FLOW_MODE_LOC_SEND) 490 ecmd->rx_pause = (skge->flow_control == FLOW_MODE_SYMMETRIC)
491 || (skge->flow_control == FLOW_MODE_SYMMETRIC); 491 || (skge->flow_control == FLOW_MODE_SYM_OR_REM);
492 ecmd->rx_pause = (skge->flow_control == FLOW_MODE_REM_SEND) 492 ecmd->tx_pause = ecmd->rx_pause || (skge->flow_control == FLOW_MODE_LOC_SEND);
493 || (skge->flow_control == FLOW_MODE_SYMMETRIC);
494 493
495 ecmd->autoneg = skge->autoneg; 494 ecmd->autoneg = ecmd->rx_pause || ecmd->tx_pause;
496} 495}
497 496
498static int skge_set_pauseparam(struct net_device *dev, 497static int skge_set_pauseparam(struct net_device *dev,
499 struct ethtool_pauseparam *ecmd) 498 struct ethtool_pauseparam *ecmd)
500{ 499{
501 struct skge_port *skge = netdev_priv(dev); 500 struct skge_port *skge = netdev_priv(dev);
501 struct ethtool_pauseparam old;
502 502
503 skge->autoneg = ecmd->autoneg; 503 skge_get_pauseparam(dev, &old);
504 if (ecmd->rx_pause && ecmd->tx_pause) 504
505 skge->flow_control = FLOW_MODE_SYMMETRIC; 505 if (ecmd->autoneg != old.autoneg)
506 else if (ecmd->rx_pause && !ecmd->tx_pause) 506 skge->flow_control = ecmd->autoneg ? FLOW_MODE_NONE : FLOW_MODE_SYMMETRIC;
507 skge->flow_control = FLOW_MODE_REM_SEND; 507 else {
508 else if (!ecmd->rx_pause && ecmd->tx_pause) 508 if (ecmd->rx_pause && ecmd->tx_pause)
509 skge->flow_control = FLOW_MODE_LOC_SEND; 509 skge->flow_control = FLOW_MODE_SYMMETRIC;
510 else 510 else if (ecmd->rx_pause && !ecmd->tx_pause)
511 skge->flow_control = FLOW_MODE_NONE; 511 skge->flow_control = FLOW_MODE_SYM_OR_REM;
512 else if (!ecmd->rx_pause && ecmd->tx_pause)
513 skge->flow_control = FLOW_MODE_LOC_SEND;
514 else
515 skge->flow_control = FLOW_MODE_NONE;
516 }
512 517
513 if (netif_running(dev)) 518 if (netif_running(dev))
514 skge_phy_reset(skge); 519 skge_phy_reset(skge);
520
515 return 0; 521 return 0;
516} 522}
517 523
@@ -854,6 +860,23 @@ static int skge_rx_fill(struct net_device *dev)
854 return 0; 860 return 0;
855} 861}
856 862
863static const char *skge_pause(enum pause_status status)
864{
865 switch(status) {
866 case FLOW_STAT_NONE:
867 return "none";
868 case FLOW_STAT_REM_SEND:
869 return "rx only";
870 case FLOW_STAT_LOC_SEND:
871 return "tx_only";
872 case FLOW_STAT_SYMMETRIC: /* Both station may send PAUSE */
873 return "both";
874 default:
875 return "indeterminated";
876 }
877}
878
879
857static void skge_link_up(struct skge_port *skge) 880static void skge_link_up(struct skge_port *skge)
858{ 881{
859 skge_write8(skge->hw, SK_REG(skge->port, LNK_LED_REG), 882 skge_write8(skge->hw, SK_REG(skge->port, LNK_LED_REG),
@@ -862,16 +885,13 @@ static void skge_link_up(struct skge_port *skge)
862 netif_carrier_on(skge->netdev); 885 netif_carrier_on(skge->netdev);
863 netif_wake_queue(skge->netdev); 886 netif_wake_queue(skge->netdev);
864 887
865 if (netif_msg_link(skge)) 888 if (netif_msg_link(skge)) {
866 printk(KERN_INFO PFX 889 printk(KERN_INFO PFX
867 "%s: Link is up at %d Mbps, %s duplex, flow control %s\n", 890 "%s: Link is up at %d Mbps, %s duplex, flow control %s\n",
868 skge->netdev->name, skge->speed, 891 skge->netdev->name, skge->speed,
869 skge->duplex == DUPLEX_FULL ? "full" : "half", 892 skge->duplex == DUPLEX_FULL ? "full" : "half",
870 (skge->flow_control == FLOW_MODE_NONE) ? "none" : 893 skge_pause(skge->flow_status));
871 (skge->flow_control == FLOW_MODE_LOC_SEND) ? "tx only" : 894 }
872 (skge->flow_control == FLOW_MODE_REM_SEND) ? "rx only" :
873 (skge->flow_control == FLOW_MODE_SYMMETRIC) ? "tx and rx" :
874 "unknown");
875} 895}
876 896
877static void skge_link_down(struct skge_port *skge) 897static void skge_link_down(struct skge_port *skge)
@@ -884,6 +904,29 @@ static void skge_link_down(struct skge_port *skge)
884 printk(KERN_INFO PFX "%s: Link is down.\n", skge->netdev->name); 904 printk(KERN_INFO PFX "%s: Link is down.\n", skge->netdev->name);
885} 905}
886 906
907
908static void xm_link_down(struct skge_hw *hw, int port)
909{
910 struct net_device *dev = hw->dev[port];
911 struct skge_port *skge = netdev_priv(dev);
912 u16 cmd, msk;
913
914 if (hw->phy_type == SK_PHY_XMAC) {
915 msk = xm_read16(hw, port, XM_IMSK);
916 msk |= XM_IS_INP_ASS | XM_IS_LIPA_RC | XM_IS_RX_PAGE | XM_IS_AND;
917 xm_write16(hw, port, XM_IMSK, msk);
918 }
919
920 cmd = xm_read16(hw, port, XM_MMU_CMD);
921 cmd &= ~(XM_MMU_ENA_RX | XM_MMU_ENA_TX);
922 xm_write16(hw, port, XM_MMU_CMD, cmd);
923 /* dummy read to ensure writing */
924 (void) xm_read16(hw, port, XM_MMU_CMD);
925
926 if (netif_carrier_ok(dev))
927 skge_link_down(skge);
928}
929
887static int __xm_phy_read(struct skge_hw *hw, int port, u16 reg, u16 *val) 930static int __xm_phy_read(struct skge_hw *hw, int port, u16 reg, u16 *val)
888{ 931{
889 int i; 932 int i;
@@ -992,7 +1035,15 @@ static const u16 phy_pause_map[] = {
992 [FLOW_MODE_NONE] = 0, 1035 [FLOW_MODE_NONE] = 0,
993 [FLOW_MODE_LOC_SEND] = PHY_AN_PAUSE_ASYM, 1036 [FLOW_MODE_LOC_SEND] = PHY_AN_PAUSE_ASYM,
994 [FLOW_MODE_SYMMETRIC] = PHY_AN_PAUSE_CAP, 1037 [FLOW_MODE_SYMMETRIC] = PHY_AN_PAUSE_CAP,
995 [FLOW_MODE_REM_SEND] = PHY_AN_PAUSE_CAP | PHY_AN_PAUSE_ASYM, 1038 [FLOW_MODE_SYM_OR_REM] = PHY_AN_PAUSE_CAP | PHY_AN_PAUSE_ASYM,
1039};
1040
1041/* special defines for FIBER (88E1011S only) */
1042static const u16 fiber_pause_map[] = {
1043 [FLOW_MODE_NONE] = PHY_X_P_NO_PAUSE,
1044 [FLOW_MODE_LOC_SEND] = PHY_X_P_ASYM_MD,
1045 [FLOW_MODE_SYMMETRIC] = PHY_X_P_SYM_MD,
1046 [FLOW_MODE_SYM_OR_REM] = PHY_X_P_BOTH_MD,
996}; 1047};
997 1048
998 1049
@@ -1008,14 +1059,7 @@ static void bcom_check_link(struct skge_hw *hw, int port)
1008 status = xm_phy_read(hw, port, PHY_BCOM_STAT); 1059 status = xm_phy_read(hw, port, PHY_BCOM_STAT);
1009 1060
1010 if ((status & PHY_ST_LSYNC) == 0) { 1061 if ((status & PHY_ST_LSYNC) == 0) {
1011 u16 cmd = xm_read16(hw, port, XM_MMU_CMD); 1062 xm_link_down(hw, port);
1012 cmd &= ~(XM_MMU_ENA_RX | XM_MMU_ENA_TX);
1013 xm_write16(hw, port, XM_MMU_CMD, cmd);
1014 /* dummy read to ensure writing */
1015 (void) xm_read16(hw, port, XM_MMU_CMD);
1016
1017 if (netif_carrier_ok(dev))
1018 skge_link_down(skge);
1019 return; 1063 return;
1020 } 1064 }
1021 1065
@@ -1048,20 +1092,19 @@ static void bcom_check_link(struct skge_hw *hw, int port)
1048 return; 1092 return;
1049 } 1093 }
1050 1094
1051
1052 /* We are using IEEE 802.3z/D5.0 Table 37-4 */ 1095 /* We are using IEEE 802.3z/D5.0 Table 37-4 */
1053 switch (aux & PHY_B_AS_PAUSE_MSK) { 1096 switch (aux & PHY_B_AS_PAUSE_MSK) {
1054 case PHY_B_AS_PAUSE_MSK: 1097 case PHY_B_AS_PAUSE_MSK:
1055 skge->flow_control = FLOW_MODE_SYMMETRIC; 1098 skge->flow_status = FLOW_STAT_SYMMETRIC;
1056 break; 1099 break;
1057 case PHY_B_AS_PRR: 1100 case PHY_B_AS_PRR:
1058 skge->flow_control = FLOW_MODE_REM_SEND; 1101 skge->flow_status = FLOW_STAT_REM_SEND;
1059 break; 1102 break;
1060 case PHY_B_AS_PRT: 1103 case PHY_B_AS_PRT:
1061 skge->flow_control = FLOW_MODE_LOC_SEND; 1104 skge->flow_status = FLOW_STAT_LOC_SEND;
1062 break; 1105 break;
1063 default: 1106 default:
1064 skge->flow_control = FLOW_MODE_NONE; 1107 skge->flow_status = FLOW_STAT_NONE;
1065 } 1108 }
1066 skge->speed = SPEED_1000; 1109 skge->speed = SPEED_1000;
1067 } 1110 }
@@ -1191,17 +1234,7 @@ static void xm_phy_init(struct skge_port *skge)
1191 if (skge->advertising & ADVERTISED_1000baseT_Full) 1234 if (skge->advertising & ADVERTISED_1000baseT_Full)
1192 ctrl |= PHY_X_AN_FD; 1235 ctrl |= PHY_X_AN_FD;
1193 1236
1194 switch(skge->flow_control) { 1237 ctrl |= fiber_pause_map[skge->flow_control];
1195 case FLOW_MODE_NONE:
1196 ctrl |= PHY_X_P_NO_PAUSE;
1197 break;
1198 case FLOW_MODE_LOC_SEND:
1199 ctrl |= PHY_X_P_ASYM_MD;
1200 break;
1201 case FLOW_MODE_SYMMETRIC:
1202 ctrl |= PHY_X_P_BOTH_MD;
1203 break;
1204 }
1205 1238
1206 xm_phy_write(hw, port, PHY_XMAC_AUNE_ADV, ctrl); 1239 xm_phy_write(hw, port, PHY_XMAC_AUNE_ADV, ctrl);
1207 1240
@@ -1235,14 +1268,7 @@ static void xm_check_link(struct net_device *dev)
1235 status = xm_phy_read(hw, port, PHY_XMAC_STAT); 1268 status = xm_phy_read(hw, port, PHY_XMAC_STAT);
1236 1269
1237 if ((status & PHY_ST_LSYNC) == 0) { 1270 if ((status & PHY_ST_LSYNC) == 0) {
1238 u16 cmd = xm_read16(hw, port, XM_MMU_CMD); 1271 xm_link_down(hw, port);
1239 cmd &= ~(XM_MMU_ENA_RX | XM_MMU_ENA_TX);
1240 xm_write16(hw, port, XM_MMU_CMD, cmd);
1241 /* dummy read to ensure writing */
1242 (void) xm_read16(hw, port, XM_MMU_CMD);
1243
1244 if (netif_carrier_ok(dev))
1245 skge_link_down(skge);
1246 return; 1272 return;
1247 } 1273 }
1248 1274
@@ -1276,15 +1302,20 @@ static void xm_check_link(struct net_device *dev)
1276 } 1302 }
1277 1303
1278 /* We are using IEEE 802.3z/D5.0 Table 37-4 */ 1304 /* We are using IEEE 802.3z/D5.0 Table 37-4 */
1279 if (lpa & PHY_X_P_SYM_MD) 1305 if ((skge->flow_control == FLOW_MODE_SYMMETRIC ||
1280 skge->flow_control = FLOW_MODE_SYMMETRIC; 1306 skge->flow_control == FLOW_MODE_SYM_OR_REM) &&
1281 else if ((lpa & PHY_X_RS_PAUSE) == PHY_X_P_ASYM_MD) 1307 (lpa & PHY_X_P_SYM_MD))
1282 skge->flow_control = FLOW_MODE_REM_SEND; 1308 skge->flow_status = FLOW_STAT_SYMMETRIC;
1283 else if ((lpa & PHY_X_RS_PAUSE) == PHY_X_P_BOTH_MD) 1309 else if (skge->flow_control == FLOW_MODE_SYM_OR_REM &&
1284 skge->flow_control = FLOW_MODE_LOC_SEND; 1310 (lpa & PHY_X_RS_PAUSE) == PHY_X_P_ASYM_MD)
1311 /* Enable PAUSE receive, disable PAUSE transmit */
1312 skge->flow_status = FLOW_STAT_REM_SEND;
1313 else if (skge->flow_control == FLOW_MODE_LOC_SEND &&
1314 (lpa & PHY_X_RS_PAUSE) == PHY_X_P_BOTH_MD)
1315 /* Disable PAUSE receive, enable PAUSE transmit */
1316 skge->flow_status = FLOW_STAT_LOC_SEND;
1285 else 1317 else
1286 skge->flow_control = FLOW_MODE_NONE; 1318 skge->flow_status = FLOW_STAT_NONE;
1287
1288 1319
1289 skge->speed = SPEED_1000; 1320 skge->speed = SPEED_1000;
1290 } 1321 }
@@ -1568,6 +1599,10 @@ static void genesis_mac_intr(struct skge_hw *hw, int port)
1568 printk(KERN_DEBUG PFX "%s: mac interrupt status 0x%x\n", 1599 printk(KERN_DEBUG PFX "%s: mac interrupt status 0x%x\n",
1569 skge->netdev->name, status); 1600 skge->netdev->name, status);
1570 1601
1602 if (hw->phy_type == SK_PHY_XMAC &&
1603 (status & (XM_IS_INP_ASS | XM_IS_LIPA_RC)))
1604 xm_link_down(hw, port);
1605
1571 if (status & XM_IS_TXF_UR) { 1606 if (status & XM_IS_TXF_UR) {
1572 xm_write32(hw, port, XM_MODE, XM_MD_FTF); 1607 xm_write32(hw, port, XM_MODE, XM_MD_FTF);
1573 ++skge->net_stats.tx_fifo_errors; 1608 ++skge->net_stats.tx_fifo_errors;
@@ -1582,7 +1617,7 @@ static void genesis_link_up(struct skge_port *skge)
1582{ 1617{
1583 struct skge_hw *hw = skge->hw; 1618 struct skge_hw *hw = skge->hw;
1584 int port = skge->port; 1619 int port = skge->port;
1585 u16 cmd; 1620 u16 cmd, msk;
1586 u32 mode; 1621 u32 mode;
1587 1622
1588 cmd = xm_read16(hw, port, XM_MMU_CMD); 1623 cmd = xm_read16(hw, port, XM_MMU_CMD);
@@ -1591,8 +1626,8 @@ static void genesis_link_up(struct skge_port *skge)
1591 * enabling pause frame reception is required for 1000BT 1626 * enabling pause frame reception is required for 1000BT
1592 * because the XMAC is not reset if the link is going down 1627 * because the XMAC is not reset if the link is going down
1593 */ 1628 */
1594 if (skge->flow_control == FLOW_MODE_NONE || 1629 if (skge->flow_status == FLOW_STAT_NONE ||
1595 skge->flow_control == FLOW_MODE_LOC_SEND) 1630 skge->flow_status == FLOW_STAT_LOC_SEND)
1596 /* Disable Pause Frame Reception */ 1631 /* Disable Pause Frame Reception */
1597 cmd |= XM_MMU_IGN_PF; 1632 cmd |= XM_MMU_IGN_PF;
1598 else 1633 else
@@ -1602,8 +1637,8 @@ static void genesis_link_up(struct skge_port *skge)
1602 xm_write16(hw, port, XM_MMU_CMD, cmd); 1637 xm_write16(hw, port, XM_MMU_CMD, cmd);
1603 1638
1604 mode = xm_read32(hw, port, XM_MODE); 1639 mode = xm_read32(hw, port, XM_MODE);
1605 if (skge->flow_control == FLOW_MODE_SYMMETRIC || 1640 if (skge->flow_status == FLOW_STAT_SYMMETRIC ||
1606 skge->flow_control == FLOW_MODE_LOC_SEND) { 1641 skge->flow_status == FLOW_STAT_LOC_SEND) {
1607 /* 1642 /*
1608 * Configure Pause Frame Generation 1643 * Configure Pause Frame Generation
1609 * Use internal and external Pause Frame Generation. 1644 * Use internal and external Pause Frame Generation.
@@ -1631,7 +1666,11 @@ static void genesis_link_up(struct skge_port *skge)
1631 } 1666 }
1632 1667
1633 xm_write32(hw, port, XM_MODE, mode); 1668 xm_write32(hw, port, XM_MODE, mode);
1634 xm_write16(hw, port, XM_IMSK, XM_DEF_MSK); 1669 msk = XM_DEF_MSK;
1670 if (hw->phy_type != SK_PHY_XMAC)
1671 msk |= XM_IS_INP_ASS; /* disable GP0 interrupt bit */
1672
1673 xm_write16(hw, port, XM_IMSK, msk);
1635 xm_read16(hw, port, XM_ISRC); 1674 xm_read16(hw, port, XM_ISRC);
1636 1675
1637 /* get MMU Command Reg. */ 1676 /* get MMU Command Reg. */
@@ -1779,11 +1818,17 @@ static void yukon_init(struct skge_hw *hw, int port)
1779 adv |= PHY_M_AN_10_FD; 1818 adv |= PHY_M_AN_10_FD;
1780 if (skge->advertising & ADVERTISED_10baseT_Half) 1819 if (skge->advertising & ADVERTISED_10baseT_Half)
1781 adv |= PHY_M_AN_10_HD; 1820 adv |= PHY_M_AN_10_HD;
1782 } else /* special defines for FIBER (88E1011S only) */
1783 adv |= PHY_M_AN_1000X_AHD | PHY_M_AN_1000X_AFD;
1784 1821
1785 /* Set Flow-control capabilities */ 1822 /* Set Flow-control capabilities */
1786 adv |= phy_pause_map[skge->flow_control]; 1823 adv |= phy_pause_map[skge->flow_control];
1824 } else {
1825 if (skge->advertising & ADVERTISED_1000baseT_Full)
1826 adv |= PHY_M_AN_1000X_AFD;
1827 if (skge->advertising & ADVERTISED_1000baseT_Half)
1828 adv |= PHY_M_AN_1000X_AHD;
1829
1830 adv |= fiber_pause_map[skge->flow_control];
1831 }
1787 1832
1788 /* Restart Auto-negotiation */ 1833 /* Restart Auto-negotiation */
1789 ctrl |= PHY_CT_ANE | PHY_CT_RE_CFG; 1834 ctrl |= PHY_CT_ANE | PHY_CT_RE_CFG;
@@ -1917,6 +1962,11 @@ static void yukon_mac_init(struct skge_hw *hw, int port)
1917 case FLOW_MODE_LOC_SEND: 1962 case FLOW_MODE_LOC_SEND:
1918 /* disable Rx flow-control */ 1963 /* disable Rx flow-control */
1919 reg |= GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS; 1964 reg |= GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS;
1965 break;
1966 case FLOW_MODE_SYMMETRIC:
1967 case FLOW_MODE_SYM_OR_REM:
1968 /* enable Tx & Rx flow-control */
1969 break;
1920 } 1970 }
1921 1971
1922 gma_write16(hw, port, GM_GP_CTRL, reg); 1972 gma_write16(hw, port, GM_GP_CTRL, reg);
@@ -2111,13 +2161,11 @@ static void yukon_link_down(struct skge_port *skge)
2111 ctrl &= ~(GM_GPCR_RX_ENA | GM_GPCR_TX_ENA); 2161 ctrl &= ~(GM_GPCR_RX_ENA | GM_GPCR_TX_ENA);
2112 gma_write16(hw, port, GM_GP_CTRL, ctrl); 2162 gma_write16(hw, port, GM_GP_CTRL, ctrl);
2113 2163
2114 if (skge->flow_control == FLOW_MODE_REM_SEND) { 2164 if (skge->flow_status == FLOW_STAT_REM_SEND) {
2165 ctrl = gm_phy_read(hw, port, PHY_MARV_AUNE_ADV);
2166 ctrl |= PHY_M_AN_ASP;
2115 /* restore Asymmetric Pause bit */ 2167 /* restore Asymmetric Pause bit */
2116 gm_phy_write(hw, port, PHY_MARV_AUNE_ADV, 2168 gm_phy_write(hw, port, PHY_MARV_AUNE_ADV, ctrl);
2117 gm_phy_read(hw, port,
2118 PHY_MARV_AUNE_ADV)
2119 | PHY_M_AN_ASP);
2120
2121 } 2169 }
2122 2170
2123 yukon_reset(hw, port); 2171 yukon_reset(hw, port);
@@ -2164,19 +2212,19 @@ static void yukon_phy_intr(struct skge_port *skge)
2164 /* We are using IEEE 802.3z/D5.0 Table 37-4 */ 2212 /* We are using IEEE 802.3z/D5.0 Table 37-4 */
2165 switch (phystat & PHY_M_PS_PAUSE_MSK) { 2213 switch (phystat & PHY_M_PS_PAUSE_MSK) {
2166 case PHY_M_PS_PAUSE_MSK: 2214 case PHY_M_PS_PAUSE_MSK:
2167 skge->flow_control = FLOW_MODE_SYMMETRIC; 2215 skge->flow_status = FLOW_STAT_SYMMETRIC;
2168 break; 2216 break;
2169 case PHY_M_PS_RX_P_EN: 2217 case PHY_M_PS_RX_P_EN:
2170 skge->flow_control = FLOW_MODE_REM_SEND; 2218 skge->flow_status = FLOW_STAT_REM_SEND;
2171 break; 2219 break;
2172 case PHY_M_PS_TX_P_EN: 2220 case PHY_M_PS_TX_P_EN:
2173 skge->flow_control = FLOW_MODE_LOC_SEND; 2221 skge->flow_status = FLOW_STAT_LOC_SEND;
2174 break; 2222 break;
2175 default: 2223 default:
2176 skge->flow_control = FLOW_MODE_NONE; 2224 skge->flow_status = FLOW_STAT_NONE;
2177 } 2225 }
2178 2226
2179 if (skge->flow_control == FLOW_MODE_NONE || 2227 if (skge->flow_status == FLOW_STAT_NONE ||
2180 (skge->speed < SPEED_1000 && skge->duplex == DUPLEX_HALF)) 2228 (skge->speed < SPEED_1000 && skge->duplex == DUPLEX_HALF))
2181 skge_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF); 2229 skge_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
2182 else 2230 else
@@ -3399,7 +3447,7 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,
3399 3447
3400 /* Auto speed and flow control */ 3448 /* Auto speed and flow control */
3401 skge->autoneg = AUTONEG_ENABLE; 3449 skge->autoneg = AUTONEG_ENABLE;
3402 skge->flow_control = FLOW_MODE_SYMMETRIC; 3450 skge->flow_control = FLOW_MODE_SYM_OR_REM;
3403 skge->duplex = -1; 3451 skge->duplex = -1;
3404 skge->speed = -1; 3452 skge->speed = -1;
3405 skge->advertising = skge_supported_modes(hw); 3453 skge->advertising = skge_supported_modes(hw);
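
Note: the skge.c hunks separate the configured pause mode (flow_control, new default FLOW_MODE_SYM_OR_REM) from the negotiated result (flow_status), factor the duplicated XMAC link-down sequence into xm_link_down(), and replace the per-mode switch for fiber advertisement bits with the fiber_pause_map[] table. A sketch of that table-driven setup; the bit values below are placeholders rather than the real PHY_X_P_* constants.

/* Table-driven advertisement sketch.  The bit values are placeholders,
 * not the driver's PHY_X_P_* constants. */
#include <stdio.h>
#include <stdint.h>

enum pause_control {
	FLOW_MODE_NONE = 1,
	FLOW_MODE_LOC_SEND = 2,
	FLOW_MODE_SYMMETRIC = 3,
	FLOW_MODE_SYM_OR_REM = 4,
};

static const uint16_t fiber_pause_map[] = {
	[FLOW_MODE_NONE]       = 0x0,	/* advertise no pause */
	[FLOW_MODE_LOC_SEND]   = 0x1,	/* asymmetric only */
	[FLOW_MODE_SYMMETRIC]  = 0x2,	/* symmetric only */
	[FLOW_MODE_SYM_OR_REM] = 0x3,	/* symmetric or remote-send */
};

int main(void)
{
	enum pause_control mode = FLOW_MODE_SYM_OR_REM;	/* new driver default */
	uint16_t adv = fiber_pause_map[mode];		/* one lookup replaces the switch */

	printf("advertised pause bits: 0x%x\n", adv);
	return 0;
}
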
diff --git a/drivers/net/skge.h b/drivers/net/skge.h
index d0b47d46cf9d..537c0aaa1db8 100644
--- a/drivers/net/skge.h
+++ b/drivers/net/skge.h
@@ -2195,7 +2195,8 @@ enum {
2195 XM_IS_RX_COMP = 1<<0, /* Bit 0: Frame Rx Complete */ 2195 XM_IS_RX_COMP = 1<<0, /* Bit 0: Frame Rx Complete */
2196}; 2196};
2197 2197
2198#define XM_DEF_MSK (~(XM_IS_RXC_OV | XM_IS_TXC_OV | XM_IS_RXF_OV | XM_IS_TXF_UR)) 2198#define XM_DEF_MSK (~(XM_IS_INP_ASS | XM_IS_LIPA_RC | \
2199 XM_IS_RXF_OV | XM_IS_TXF_UR))
2199 2200
2200 2201
2201/* XM_HW_CFG 16 bit r/w Hardware Config Register */ 2202/* XM_HW_CFG 16 bit r/w Hardware Config Register */
@@ -2426,13 +2427,24 @@ struct skge_hw {
2426 struct mutex phy_mutex; 2427 struct mutex phy_mutex;
2427}; 2428};
2428 2429
2429enum { 2430enum pause_control {
2430 FLOW_MODE_NONE = 0, /* No Flow-Control */ 2431 FLOW_MODE_NONE = 1, /* No Flow-Control */
2431 FLOW_MODE_LOC_SEND = 1, /* Local station sends PAUSE */ 2432 FLOW_MODE_LOC_SEND = 2, /* Local station sends PAUSE */
2432 FLOW_MODE_REM_SEND = 2, /* Symmetric or just remote */
2433 FLOW_MODE_SYMMETRIC = 3, /* Both stations may send PAUSE */ 2433 FLOW_MODE_SYMMETRIC = 3, /* Both stations may send PAUSE */
2434 FLOW_MODE_SYM_OR_REM = 4, /* Both stations may send PAUSE or
2435 * just the remote station may send PAUSE
2436 */
2437};
2438
2439enum pause_status {
2440 FLOW_STAT_INDETERMINATED=0, /* indeterminated */
2441 FLOW_STAT_NONE, /* No Flow Control */
2442 FLOW_STAT_REM_SEND, /* Remote Station sends PAUSE */
2443 FLOW_STAT_LOC_SEND, /* Local station sends PAUSE */
2444 FLOW_STAT_SYMMETRIC, /* Both station may send PAUSE */
2434}; 2445};
2435 2446
2447
2436struct skge_port { 2448struct skge_port {
2437 u32 msg_enable; 2449 u32 msg_enable;
2438 struct skge_hw *hw; 2450 struct skge_hw *hw;
@@ -2445,9 +2457,10 @@ struct skge_port {
2445 struct net_device_stats net_stats; 2457 struct net_device_stats net_stats;
2446 2458
2447 struct work_struct link_thread; 2459 struct work_struct link_thread;
2460 enum pause_control flow_control;
2461 enum pause_status flow_status;
2448 u8 rx_csum; 2462 u8 rx_csum;
2449 u8 blink_on; 2463 u8 blink_on;
2450 u8 flow_control;
2451 u8 wol; 2464 u8 wol;
2452 u8 autoneg; /* AUTONEG_ENABLE, AUTONEG_DISABLE */ 2465 u8 autoneg; /* AUTONEG_ENABLE, AUTONEG_DISABLE */
2453 u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */ 2466 u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */
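
Note: the skge.h hunk makes the split explicit: enum pause_control is what the user asked for, enum pause_status is what autonegotiation actually produced. A simplified sketch of the resolution step performed in xm_check_link() above; LPA_SYM/LPA_ASYM are placeholder bits for the partner's advertised pause ability, not the driver's PHY_X_P_* values.

/* Simplified resolution sketch: configured mode plus partner ability
 * gives the negotiated status.  LPA_SYM/LPA_ASYM are placeholder bits. */
#include <stdio.h>

enum pause_control { FLOW_MODE_NONE = 1, FLOW_MODE_LOC_SEND,
		     FLOW_MODE_SYMMETRIC, FLOW_MODE_SYM_OR_REM };
enum pause_status  { FLOW_STAT_NONE = 1, FLOW_STAT_REM_SEND,
		     FLOW_STAT_LOC_SEND, FLOW_STAT_SYMMETRIC };

#define LPA_SYM  0x1
#define LPA_ASYM 0x2

static enum pause_status resolve(enum pause_control ctl, unsigned int lpa)
{
	unsigned int pause_bits = lpa & (LPA_SYM | LPA_ASYM);

	if ((ctl == FLOW_MODE_SYMMETRIC || ctl == FLOW_MODE_SYM_OR_REM) &&
	    (lpa & LPA_SYM))
		return FLOW_STAT_SYMMETRIC;
	if (ctl == FLOW_MODE_SYM_OR_REM && pause_bits == LPA_ASYM)
		return FLOW_STAT_REM_SEND;	/* rx pause only */
	if (ctl == FLOW_MODE_LOC_SEND && pause_bits == (LPA_SYM | LPA_ASYM))
		return FLOW_STAT_LOC_SEND;	/* tx pause only */
	return FLOW_STAT_NONE;
}

int main(void)
{
	printf("status = %d\n", resolve(FLOW_MODE_SYM_OR_REM, LPA_SYM));
	return 0;
}
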
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 459c845d6648..c10e7f5faa5f 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -683,7 +683,7 @@ static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
683 sky2_write16(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_OPER_ON); 683 sky2_write16(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_OPER_ON);
684 684
685 if (hw->chip_id == CHIP_ID_YUKON_EC_U) { 685 if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
686 sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 768/8); 686 sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 512/8);
687 sky2_write8(hw, SK_REG(port, RX_GMF_UP_THR), 1024/8); 687 sky2_write8(hw, SK_REG(port, RX_GMF_UP_THR), 1024/8);
688 if (hw->dev[port]->mtu > ETH_DATA_LEN) { 688 if (hw->dev[port]->mtu > ETH_DATA_LEN) {
689 /* set Tx GMAC FIFO Almost Empty Threshold */ 689 /* set Tx GMAC FIFO Almost Empty Threshold */
@@ -1907,7 +1907,7 @@ static struct sk_buff *receive_copy(struct sky2_port *sky2,
1907 pci_dma_sync_single_for_device(sky2->hw->pdev, re->data_addr, 1907 pci_dma_sync_single_for_device(sky2->hw->pdev, re->data_addr,
1908 length, PCI_DMA_FROMDEVICE); 1908 length, PCI_DMA_FROMDEVICE);
1909 re->skb->ip_summed = CHECKSUM_NONE; 1909 re->skb->ip_summed = CHECKSUM_NONE;
1910 __skb_put(skb, length); 1910 skb_put(skb, length);
1911 } 1911 }
1912 return skb; 1912 return skb;
1913} 1913}
@@ -1970,7 +1970,7 @@ static struct sk_buff *receive_new(struct sky2_port *sky2,
1970 if (skb_shinfo(skb)->nr_frags) 1970 if (skb_shinfo(skb)->nr_frags)
1971 skb_put_frags(skb, hdr_space, length); 1971 skb_put_frags(skb, hdr_space, length);
1972 else 1972 else
1973 skb_put(skb, hdr_space); 1973 skb_put(skb, length);
1974 return skb; 1974 return skb;
1975} 1975}
1976 1976
@@ -2220,8 +2220,7 @@ static void sky2_hw_intr(struct sky2_hw *hw)
2220 /* PCI-Express uncorrectable Error occurred */ 2220 /* PCI-Express uncorrectable Error occurred */
2221 u32 pex_err; 2221 u32 pex_err;
2222 2222
2223 pex_err = sky2_pci_read32(hw, 2223 pex_err = sky2_pci_read32(hw, PEX_UNC_ERR_STAT);
2224 hw->err_cap + PCI_ERR_UNCOR_STATUS);
2225 2224
2226 if (net_ratelimit()) 2225 if (net_ratelimit())
2227 printk(KERN_ERR PFX "%s: pci express error (0x%x)\n", 2226 printk(KERN_ERR PFX "%s: pci express error (0x%x)\n",
@@ -2229,20 +2228,15 @@ static void sky2_hw_intr(struct sky2_hw *hw)
2229 2228
2230 /* clear the interrupt */ 2229 /* clear the interrupt */
2231 sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); 2230 sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
2232 sky2_pci_write32(hw, 2231 sky2_pci_write32(hw, PEX_UNC_ERR_STAT,
2233 hw->err_cap + PCI_ERR_UNCOR_STATUS, 2232 0xffffffffUL);
2234 0xffffffffUL);
2235 sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); 2233 sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
2236 2234
2237 2235 if (pex_err & PEX_FATAL_ERRORS) {
2238 /* In case of fatal error mask off to keep from getting stuck */
2239 if (pex_err & (PCI_ERR_UNC_POISON_TLP | PCI_ERR_UNC_FCP
2240 | PCI_ERR_UNC_DLP)) {
2241 u32 hwmsk = sky2_read32(hw, B0_HWE_IMSK); 2236 u32 hwmsk = sky2_read32(hw, B0_HWE_IMSK);
2242 hwmsk &= ~Y2_IS_PCI_EXP; 2237 hwmsk &= ~Y2_IS_PCI_EXP;
2243 sky2_write32(hw, B0_HWE_IMSK, hwmsk); 2238 sky2_write32(hw, B0_HWE_IMSK, hwmsk);
2244 } 2239 }
2245
2246 } 2240 }
2247 2241
2248 if (status & Y2_HWE_L1_MASK) 2242 if (status & Y2_HWE_L1_MASK)
@@ -2423,7 +2417,6 @@ static int sky2_reset(struct sky2_hw *hw)
2423 u16 status; 2417 u16 status;
2424 u8 t8; 2418 u8 t8;
2425 int i; 2419 int i;
2426 u32 msk;
2427 2420
2428 sky2_write8(hw, B0_CTST, CS_RST_CLR); 2421 sky2_write8(hw, B0_CTST, CS_RST_CLR);
2429 2422
@@ -2464,13 +2457,9 @@ static int sky2_reset(struct sky2_hw *hw)
2464 sky2_write8(hw, B0_CTST, CS_MRST_CLR); 2457 sky2_write8(hw, B0_CTST, CS_MRST_CLR);
2465 2458
2466 /* clear any PEX errors */ 2459 /* clear any PEX errors */
2467 if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) { 2460 if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP))
2468 hw->err_cap = pci_find_ext_capability(hw->pdev, PCI_EXT_CAP_ID_ERR); 2461 sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL);
2469 if (hw->err_cap) 2462
2470 sky2_pci_write32(hw,
2471 hw->err_cap + PCI_ERR_UNCOR_STATUS,
2472 0xffffffffUL);
2473 }
2474 2463
2475 hw->pmd_type = sky2_read8(hw, B2_PMD_TYP); 2464 hw->pmd_type = sky2_read8(hw, B2_PMD_TYP);
2476 hw->ports = 1; 2465 hw->ports = 1;
@@ -2527,10 +2516,7 @@ static int sky2_reset(struct sky2_hw *hw)
2527 sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS2), SK_RI_TO_53); 2516 sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS2), SK_RI_TO_53);
2528 } 2517 }
2529 2518
2530 msk = Y2_HWE_ALL_MASK; 2519 sky2_write32(hw, B0_HWE_IMSK, Y2_HWE_ALL_MASK);
2531 if (!hw->err_cap)
2532 msk &= ~Y2_IS_PCI_EXP;
2533 sky2_write32(hw, B0_HWE_IMSK, msk);
2534 2520
2535 for (i = 0; i < hw->ports; i++) 2521 for (i = 0; i < hw->ports; i++)
2536 sky2_gmac_reset(hw, i); 2522 sky2_gmac_reset(hw, i);
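
Note: the sky2.c hunks drop the runtime lookup of the PCIe AER capability (hw->err_cap) in favour of the fixed PEX_UNC_ERR_STAT offset defined in sky2.h below, and mask the Y2_IS_PCI_EXP interrupt only when one of the PEX_FATAL_ERRORS bits is set. A small sketch of that masking decision; the sample status value is made up.

/* Fatal-vs-non-fatal decision sketch.  Bit positions match the sky2.h
 * hunk below; the sample status value is made up. */
#include <stdio.h>
#include <stdint.h>

#define PEX_UNSUP_REQ    (1u << 20)
#define PEX_MALFOR_TLP   (1u << 18)
#define PEX_FLOW_CTRL_P  (1u << 13)
#define PEX_DATA_LINK_P  (1u << 4)
#define PEX_FATAL_ERRORS (PEX_MALFOR_TLP | PEX_FLOW_CTRL_P | PEX_DATA_LINK_P)

int main(void)
{
	uint32_t pex_err = PEX_UNSUP_REQ;	/* non-fatal example */

	if (pex_err & PEX_FATAL_ERRORS)
		printf("fatal PEX error 0x%x: mask Y2_IS_PCI_EXP\n", pex_err);
	else
		printf("non-fatal PEX error 0x%x: clear it and carry on\n", pex_err);
	return 0;
}
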
diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
index f66109a96d95..43d2accf60e1 100644
--- a/drivers/net/sky2.h
+++ b/drivers/net/sky2.h
@@ -6,15 +6,24 @@
6 6
7#define ETH_JUMBO_MTU 9000 /* Maximum MTU supported */ 7#define ETH_JUMBO_MTU 9000 /* Maximum MTU supported */
8 8
9/* PCI device specific config registers */ 9/* PCI config registers */
10enum { 10enum {
11 PCI_DEV_REG1 = 0x40, 11 PCI_DEV_REG1 = 0x40,
12 PCI_DEV_REG2 = 0x44, 12 PCI_DEV_REG2 = 0x44,
13 PCI_DEV_STATUS = 0x7c,
13 PCI_DEV_REG3 = 0x80, 14 PCI_DEV_REG3 = 0x80,
14 PCI_DEV_REG4 = 0x84, 15 PCI_DEV_REG4 = 0x84,
15 PCI_DEV_REG5 = 0x88, 16 PCI_DEV_REG5 = 0x88,
16}; 17};
17 18
19enum {
20 PEX_DEV_CAP = 0xe4,
21 PEX_DEV_CTRL = 0xe8,
22 PEX_DEV_STA = 0xea,
23 PEX_LNK_STAT = 0xf2,
24 PEX_UNC_ERR_STAT= 0x104,
25};
26
18/* Yukon-2 */ 27/* Yukon-2 */
19enum pci_dev_reg_1 { 28enum pci_dev_reg_1 {
20 PCI_Y2_PIG_ENA = 1<<31, /* Enable Plug-in-Go (YUKON-2) */ 29 PCI_Y2_PIG_ENA = 1<<31, /* Enable Plug-in-Go (YUKON-2) */
@@ -63,6 +72,39 @@ enum pci_dev_reg_4 {
63 PCI_STATUS_REC_MASTER_ABORT | \ 72 PCI_STATUS_REC_MASTER_ABORT | \
64 PCI_STATUS_REC_TARGET_ABORT | \ 73 PCI_STATUS_REC_TARGET_ABORT | \
65 PCI_STATUS_PARITY) 74 PCI_STATUS_PARITY)
75
76enum pex_dev_ctrl {
77 PEX_DC_MAX_RRS_MSK = 7<<12, /* Bit 14..12: Max. Read Request Size */
78 PEX_DC_EN_NO_SNOOP = 1<<11,/* Enable No Snoop */
79 PEX_DC_EN_AUX_POW = 1<<10,/* Enable AUX Power */
80 PEX_DC_EN_PHANTOM = 1<<9, /* Enable Phantom Functions */
81 PEX_DC_EN_EXT_TAG = 1<<8, /* Enable Extended Tag Field */
82 PEX_DC_MAX_PLS_MSK = 7<<5, /* Bit 7.. 5: Max. Payload Size Mask */
83 PEX_DC_EN_REL_ORD = 1<<4, /* Enable Relaxed Ordering */
84 PEX_DC_EN_UNS_RQ_RP = 1<<3, /* Enable Unsupported Request Reporting */
85 PEX_DC_EN_FAT_ER_RP = 1<<2, /* Enable Fatal Error Reporting */
86 PEX_DC_EN_NFA_ER_RP = 1<<1, /* Enable Non-Fatal Error Reporting */
87 PEX_DC_EN_COR_ER_RP = 1<<0, /* Enable Correctable Error Reporting */
88};
89#define PEX_DC_MAX_RD_RQ_SIZE(x) (((x)<<12) & PEX_DC_MAX_RRS_MSK)
90
91/* PEX_UNC_ERR_STAT PEX Uncorrectable Errors Status Register (Yukon-2) */
92enum pex_err {
93 PEX_UNSUP_REQ = 1<<20, /* Unsupported Request Error */
94
95 PEX_MALFOR_TLP = 1<<18, /* Malformed TLP */
96
97 PEX_UNEXP_COMP = 1<<16, /* Unexpected Completion */
98
99 PEX_COMP_TO = 1<<14, /* Completion Timeout */
100 PEX_FLOW_CTRL_P = 1<<13, /* Flow Control Protocol Error */
101 PEX_POIS_TLP = 1<<12, /* Poisoned TLP */
102
103 PEX_DATA_LINK_P = 1<<4, /* Data Link Protocol Error */
104 PEX_FATAL_ERRORS= (PEX_MALFOR_TLP | PEX_FLOW_CTRL_P | PEX_DATA_LINK_P),
105};
106
107
66enum csr_regs { 108enum csr_regs {
67 B0_RAP = 0x0000, 109 B0_RAP = 0x0000,
68 B0_CTST = 0x0004, 110 B0_CTST = 0x0004,
@@ -1836,7 +1878,6 @@ struct sky2_hw {
1836 struct net_device *dev[2]; 1878 struct net_device *dev[2];
1837 1879
1838 int pm_cap; 1880 int pm_cap;
1839 int err_cap;
1840 u8 chip_id; 1881 u8 chip_id;
1841 u8 chip_rev; 1882 u8 chip_rev;
1842 u8 pmd_type; 1883 u8 pmd_type;
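
Note: the new PEX register block in sky2.h also carries a helper for packing the maximum read-request size into the device-control word. A sketch of PEX_DC_MAX_RD_RQ_SIZE(); the size encoding (0 = 128 bytes, 1 = 256, 2 = 512, ...) follows the PCI Express spec, and the printed value is only illustrative.

/* Sketch of the read-request-size helper: packs a 3-bit size code into
 * bits 14..12 of the PCIe device-control word. */
#include <stdio.h>

#define PEX_DC_MAX_RRS_MSK       (7 << 12)
#define PEX_DC_MAX_RD_RQ_SIZE(x) (((x) << 12) & PEX_DC_MAX_RRS_MSK)

int main(void)
{
	/* size code 2 -> 512-byte maximum read request */
	printf("devctl field = 0x%x\n", PEX_DC_MAX_RD_RQ_SIZE(2));
	return 0;
}
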
diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 636dbfcdf8cb..0c9f1e7dab2e 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -398,6 +398,24 @@ static inline void LPD7_SMC_outsw (unsigned char* a, int r,
398 398
399#define SMC_IRQ_FLAGS (0) 399#define SMC_IRQ_FLAGS (0)
400 400
401#elif defined(CONFIG_ARCH_VERSATILE)
402
403#define SMC_CAN_USE_8BIT 1
404#define SMC_CAN_USE_16BIT 1
405#define SMC_CAN_USE_32BIT 1
406#define SMC_NOWAIT 1
407
408#define SMC_inb(a, r) readb((a) + (r))
409#define SMC_inw(a, r) readw((a) + (r))
410#define SMC_inl(a, r) readl((a) + (r))
411#define SMC_outb(v, a, r) writeb(v, (a) + (r))
412#define SMC_outw(v, a, r) writew(v, (a) + (r))
413#define SMC_outl(v, a, r) writel(v, (a) + (r))
414#define SMC_insl(a, r, p, l) readsl((a) + (r), p, l)
415#define SMC_outsl(a, r, p, l) writesl((a) + (r), p, l)
416
417#define SMC_IRQ_FLAGS (0)
418
401#else 419#else
402 420
403#define SMC_CAN_USE_8BIT 1 421#define SMC_CAN_USE_8BIT 1
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 46a009085f7c..418138dd6c68 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -55,12 +55,13 @@ MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com> and Jens Osterkamp " \
55 "<Jens.Osterkamp@de.ibm.com>"); 55 "<Jens.Osterkamp@de.ibm.com>");
56MODULE_DESCRIPTION("Spider Southbridge Gigabit Ethernet driver"); 56MODULE_DESCRIPTION("Spider Southbridge Gigabit Ethernet driver");
57MODULE_LICENSE("GPL"); 57MODULE_LICENSE("GPL");
58MODULE_VERSION(VERSION);
58 59
59static int rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_DEFAULT; 60static int rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_DEFAULT;
60static int tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_DEFAULT; 61static int tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_DEFAULT;
61 62
62module_param(rx_descriptors, int, 0644); 63module_param(rx_descriptors, int, 0444);
63module_param(tx_descriptors, int, 0644); 64module_param(tx_descriptors, int, 0444);
64 65
65MODULE_PARM_DESC(rx_descriptors, "number of descriptors used " \ 66MODULE_PARM_DESC(rx_descriptors, "number of descriptors used " \
66 "in rx chains"); 67 "in rx chains");
@@ -300,7 +301,7 @@ static int
300spider_net_init_chain(struct spider_net_card *card, 301spider_net_init_chain(struct spider_net_card *card,
301 struct spider_net_descr_chain *chain, 302 struct spider_net_descr_chain *chain,
302 struct spider_net_descr *start_descr, 303 struct spider_net_descr *start_descr,
303 int direction, int no) 304 int no)
304{ 305{
305 int i; 306 int i;
306 struct spider_net_descr *descr; 307 struct spider_net_descr *descr;
@@ -315,7 +316,7 @@ spider_net_init_chain(struct spider_net_card *card,
315 316
316 buf = pci_map_single(card->pdev, descr, 317 buf = pci_map_single(card->pdev, descr,
317 SPIDER_NET_DESCR_SIZE, 318 SPIDER_NET_DESCR_SIZE,
318 direction); 319 PCI_DMA_BIDIRECTIONAL);
319 320
320 if (pci_dma_mapping_error(buf)) 321 if (pci_dma_mapping_error(buf))
321 goto iommu_error; 322 goto iommu_error;
@@ -329,11 +330,6 @@ spider_net_init_chain(struct spider_net_card *card,
329 (descr-1)->next = start_descr; 330 (descr-1)->next = start_descr;
330 start_descr->prev = descr-1; 331 start_descr->prev = descr-1;
331 332
332 descr = start_descr;
333 if (direction == PCI_DMA_FROMDEVICE)
334 for (i=0; i < no; i++, descr++)
335 descr->next_descr_addr = descr->next->bus_addr;
336
337 spin_lock_init(&chain->lock); 333 spin_lock_init(&chain->lock);
338 chain->head = start_descr; 334 chain->head = start_descr;
339 chain->tail = start_descr; 335 chain->tail = start_descr;
@@ -346,7 +342,7 @@ iommu_error:
346 if (descr->bus_addr) 342 if (descr->bus_addr)
347 pci_unmap_single(card->pdev, descr->bus_addr, 343 pci_unmap_single(card->pdev, descr->bus_addr,
348 SPIDER_NET_DESCR_SIZE, 344 SPIDER_NET_DESCR_SIZE,
349 direction); 345 PCI_DMA_BIDIRECTIONAL);
350 return -ENOMEM; 346 return -ENOMEM;
351} 347}
352 348
@@ -362,15 +358,15 @@ spider_net_free_rx_chain_contents(struct spider_net_card *card)
362 struct spider_net_descr *descr; 358 struct spider_net_descr *descr;
363 359
364 descr = card->rx_chain.head; 360 descr = card->rx_chain.head;
365 while (descr->next != card->rx_chain.head) { 361 do {
366 if (descr->skb) { 362 if (descr->skb) {
367 dev_kfree_skb(descr->skb); 363 dev_kfree_skb(descr->skb);
368 pci_unmap_single(card->pdev, descr->buf_addr, 364 pci_unmap_single(card->pdev, descr->buf_addr,
369 SPIDER_NET_MAX_FRAME, 365 SPIDER_NET_MAX_FRAME,
370 PCI_DMA_FROMDEVICE); 366 PCI_DMA_BIDIRECTIONAL);
371 } 367 }
372 descr = descr->next; 368 descr = descr->next;
373 } 369 } while (descr != card->rx_chain.head);
374} 370}
375 371
376/** 372/**
@@ -645,26 +641,41 @@ static int
645spider_net_prepare_tx_descr(struct spider_net_card *card, 641spider_net_prepare_tx_descr(struct spider_net_card *card,
646 struct sk_buff *skb) 642 struct sk_buff *skb)
647{ 643{
648 struct spider_net_descr *descr = card->tx_chain.head; 644 struct spider_net_descr *descr;
649 dma_addr_t buf; 645 dma_addr_t buf;
646 unsigned long flags;
647 int length;
650 648
651 buf = pci_map_single(card->pdev, skb->data, skb->len, PCI_DMA_TODEVICE); 649 length = skb->len;
650 if (length < ETH_ZLEN) {
651 if (skb_pad(skb, ETH_ZLEN-length))
652 return 0;
653 length = ETH_ZLEN;
654 }
655
656 buf = pci_map_single(card->pdev, skb->data, length, PCI_DMA_TODEVICE);
652 if (pci_dma_mapping_error(buf)) { 657 if (pci_dma_mapping_error(buf)) {
653 if (netif_msg_tx_err(card) && net_ratelimit()) 658 if (netif_msg_tx_err(card) && net_ratelimit())
654 pr_err("could not iommu-map packet (%p, %i). " 659 pr_err("could not iommu-map packet (%p, %i). "
655 "Dropping packet\n", skb->data, skb->len); 660 "Dropping packet\n", skb->data, length);
656 card->spider_stats.tx_iommu_map_error++; 661 card->spider_stats.tx_iommu_map_error++;
657 return -ENOMEM; 662 return -ENOMEM;
658 } 663 }
659 664
665 spin_lock_irqsave(&card->tx_chain.lock, flags);
666 descr = card->tx_chain.head;
667 card->tx_chain.head = descr->next;
668
660 descr->buf_addr = buf; 669 descr->buf_addr = buf;
661 descr->buf_size = skb->len; 670 descr->buf_size = length;
662 descr->next_descr_addr = 0; 671 descr->next_descr_addr = 0;
663 descr->skb = skb; 672 descr->skb = skb;
664 descr->data_status = 0; 673 descr->data_status = 0;
665 674
666 descr->dmac_cmd_status = 675 descr->dmac_cmd_status =
667 SPIDER_NET_DESCR_CARDOWNED | SPIDER_NET_DMAC_NOCS; 676 SPIDER_NET_DESCR_CARDOWNED | SPIDER_NET_DMAC_NOCS;
677 spin_unlock_irqrestore(&card->tx_chain.lock, flags);
678
668 if (skb->protocol == htons(ETH_P_IP)) 679 if (skb->protocol == htons(ETH_P_IP))
669 switch (skb->nh.iph->protocol) { 680 switch (skb->nh.iph->protocol) {
670 case IPPROTO_TCP: 681 case IPPROTO_TCP:
@@ -675,32 +686,51 @@ spider_net_prepare_tx_descr(struct spider_net_card *card,
675 break; 686 break;
676 } 687 }
677 688
689 /* Chain the bus address, so that the DMA engine finds this descr. */
678 descr->prev->next_descr_addr = descr->bus_addr; 690 descr->prev->next_descr_addr = descr->bus_addr;
679 691
692 card->netdev->trans_start = jiffies; /* set netdev watchdog timer */
680 return 0; 693 return 0;
681} 694}
682 695
683/** 696static int
684 * spider_net_release_tx_descr - processes a used tx descriptor 697spider_net_set_low_watermark(struct spider_net_card *card)
685 * @card: card structure
686 * @descr: descriptor to release
687 *
688 * releases a used tx descriptor (unmapping, freeing of skb)
689 */
690static inline void
691spider_net_release_tx_descr(struct spider_net_card *card)
692{ 698{
699 unsigned long flags;
700 int status;
701 int cnt=0;
702 int i;
693 struct spider_net_descr *descr = card->tx_chain.tail; 703 struct spider_net_descr *descr = card->tx_chain.tail;
694 struct sk_buff *skb;
695 704
696 card->tx_chain.tail = card->tx_chain.tail->next; 705 /* Measure the length of the queue. Measurement does not
697 descr->dmac_cmd_status |= SPIDER_NET_DESCR_NOT_IN_USE; 706 * need to be precise -- does not need a lock. */
707 while (descr != card->tx_chain.head) {
708 status = descr->dmac_cmd_status & SPIDER_NET_DESCR_NOT_IN_USE;
709 if (status == SPIDER_NET_DESCR_NOT_IN_USE)
710 break;
711 descr = descr->next;
712 cnt++;
713 }
698 714
699 /* unmap the skb */ 715 /* If TX queue is short, don't even bother with interrupts */
700 skb = descr->skb; 716 if (cnt < card->num_tx_desc/4)
701 pci_unmap_single(card->pdev, descr->buf_addr, skb->len, 717 return cnt;
702 PCI_DMA_TODEVICE); 718
703 dev_kfree_skb_any(skb); 719 /* Set the low watermark 3/4 of the way into the queue. */
720 descr = card->tx_chain.tail;
721 cnt = (cnt*3)/4;
722 for (i=0;i<cnt; i++)
723 descr = descr->next;
724
725 /* Set the new watermark, clear the old watermark */
726 spin_lock_irqsave(&card->tx_chain.lock, flags);
727 descr->dmac_cmd_status |= SPIDER_NET_DESCR_TXDESFLG;
728 if (card->low_watermark && card->low_watermark != descr)
729 card->low_watermark->dmac_cmd_status =
730 card->low_watermark->dmac_cmd_status & ~SPIDER_NET_DESCR_TXDESFLG;
731 card->low_watermark = descr;
732 spin_unlock_irqrestore(&card->tx_chain.lock, flags);
733 return cnt;
704} 734}
705 735
706/** 736/**
@@ -719,21 +749,29 @@ static int
719spider_net_release_tx_chain(struct spider_net_card *card, int brutal) 749spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
720{ 750{
721 struct spider_net_descr_chain *chain = &card->tx_chain; 751 struct spider_net_descr_chain *chain = &card->tx_chain;
752 struct spider_net_descr *descr;
753 struct sk_buff *skb;
754 u32 buf_addr;
755 unsigned long flags;
722 int status; 756 int status;
723 757
724 spider_net_read_reg(card, SPIDER_NET_GDTDMACCNTR);
725
726 while (chain->tail != chain->head) { 758 while (chain->tail != chain->head) {
727 status = spider_net_get_descr_status(chain->tail); 759 spin_lock_irqsave(&chain->lock, flags);
760 descr = chain->tail;
761
762 status = spider_net_get_descr_status(descr);
728 switch (status) { 763 switch (status) {
729 case SPIDER_NET_DESCR_COMPLETE: 764 case SPIDER_NET_DESCR_COMPLETE:
730 card->netdev_stats.tx_packets++; 765 card->netdev_stats.tx_packets++;
731 card->netdev_stats.tx_bytes += chain->tail->skb->len; 766 card->netdev_stats.tx_bytes += descr->skb->len;
732 break; 767 break;
733 768
734 case SPIDER_NET_DESCR_CARDOWNED: 769 case SPIDER_NET_DESCR_CARDOWNED:
735 if (!brutal) 770 if (!brutal) {
771 spin_unlock_irqrestore(&chain->lock, flags);
736 return 1; 772 return 1;
773 }
774
737 /* fallthrough, if we release the descriptors 775 /* fallthrough, if we release the descriptors
738 * brutally (then we don't care about 776 * brutally (then we don't care about
739 * SPIDER_NET_DESCR_CARDOWNED) */ 777 * SPIDER_NET_DESCR_CARDOWNED) */
@@ -750,11 +788,25 @@ spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
750 788
751 default: 789 default:
752 card->netdev_stats.tx_dropped++; 790 card->netdev_stats.tx_dropped++;
753 return 1; 791 if (!brutal) {
792 spin_unlock_irqrestore(&chain->lock, flags);
793 return 1;
794 }
754 } 795 }
755 spider_net_release_tx_descr(card);
756 }
757 796
797 chain->tail = descr->next;
798 descr->dmac_cmd_status |= SPIDER_NET_DESCR_NOT_IN_USE;
799 skb = descr->skb;
800 buf_addr = descr->buf_addr;
801 spin_unlock_irqrestore(&chain->lock, flags);
802
803 /* unmap the skb */
804 if (skb) {
805 int len = skb->len < ETH_ZLEN ? ETH_ZLEN : skb->len;
806 pci_unmap_single(card->pdev, buf_addr, len, PCI_DMA_TODEVICE);
807 dev_kfree_skb(skb);
808 }
809 }
758 return 0; 810 return 0;
759} 811}
760 812
@@ -763,8 +815,12 @@ spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
763 * @card: card structure 815 * @card: card structure
764 * @descr: descriptor address to enable TX processing at 816 * @descr: descriptor address to enable TX processing at
765 * 817 *
766 * spider_net_kick_tx_dma writes the current tx chain head as start address 818 * This routine will start the transmit DMA running if
767 * of the tx descriptor chain and enables the transmission DMA engine 819 * it is not already running. This routine need only be
820 * called when queueing a new packet to an empty tx queue.
821 * Writes the current tx chain head as start address
822 * of the tx descriptor chain and enables the transmission
823 * DMA engine.
768 */ 824 */
769static inline void 825static inline void
770spider_net_kick_tx_dma(struct spider_net_card *card) 826spider_net_kick_tx_dma(struct spider_net_card *card)
@@ -804,65 +860,43 @@ out:
804static int 860static int
805spider_net_xmit(struct sk_buff *skb, struct net_device *netdev) 861spider_net_xmit(struct sk_buff *skb, struct net_device *netdev)
806{ 862{
863 int cnt;
807 struct spider_net_card *card = netdev_priv(netdev); 864 struct spider_net_card *card = netdev_priv(netdev);
808 struct spider_net_descr_chain *chain = &card->tx_chain; 865 struct spider_net_descr_chain *chain = &card->tx_chain;
809 struct spider_net_descr *descr = chain->head;
810 unsigned long flags;
811 int result;
812
813 spin_lock_irqsave(&chain->lock, flags);
814 866
815 spider_net_release_tx_chain(card, 0); 867 spider_net_release_tx_chain(card, 0);
816 868
817 if (chain->head->next == chain->tail->prev) { 869 if ((chain->head->next == chain->tail->prev) ||
818 card->netdev_stats.tx_dropped++; 870 (spider_net_prepare_tx_descr(card, skb) != 0)) {
819 result = NETDEV_TX_LOCKED;
820 goto out;
821 }
822 871
823 if (spider_net_get_descr_status(descr) != SPIDER_NET_DESCR_NOT_IN_USE) {
824 card->netdev_stats.tx_dropped++; 872 card->netdev_stats.tx_dropped++;
825 result = NETDEV_TX_LOCKED; 873 netif_stop_queue(netdev);
826 goto out; 874 return NETDEV_TX_BUSY;
827 } 875 }
828 876
829 if (spider_net_prepare_tx_descr(card, skb) != 0) { 877 cnt = spider_net_set_low_watermark(card);
830 card->netdev_stats.tx_dropped++; 878 if (cnt < 5)
831 result = NETDEV_TX_BUSY; 879 spider_net_kick_tx_dma(card);
832 goto out; 880 return NETDEV_TX_OK;
833 }
834
835 result = NETDEV_TX_OK;
836
837 spider_net_kick_tx_dma(card);
838 card->tx_chain.head = card->tx_chain.head->next;
839
840out:
841 spin_unlock_irqrestore(&chain->lock, flags);
842 netif_wake_queue(netdev);
843 return result;
844} 881}
845 882
846/** 883/**
847 * spider_net_cleanup_tx_ring - cleans up the TX ring 884 * spider_net_cleanup_tx_ring - cleans up the TX ring
848 * @card: card structure 885 * @card: card structure
849 * 886 *
850 * spider_net_cleanup_tx_ring is called by the tx_timer (as we don't use 887 * spider_net_cleanup_tx_ring is called by either the tx_timer
851 * interrupts to cleanup our TX ring) and returns sent packets to the stack 888 * or the NAPI polling routine.
852 * by freeing them 889 * This routine releases resources associated with transmitted
890 * packets, including updating the queue tail pointer.
853 */ 891 */
854static void 892static void
855spider_net_cleanup_tx_ring(struct spider_net_card *card) 893spider_net_cleanup_tx_ring(struct spider_net_card *card)
856{ 894{
857 unsigned long flags;
858
859 spin_lock_irqsave(&card->tx_chain.lock, flags);
860
861 if ((spider_net_release_tx_chain(card, 0) != 0) && 895 if ((spider_net_release_tx_chain(card, 0) != 0) &&
862 (card->netdev->flags & IFF_UP)) 896 (card->netdev->flags & IFF_UP)) {
863 spider_net_kick_tx_dma(card); 897 spider_net_kick_tx_dma(card);
864 898 netif_wake_queue(card->netdev);
865 spin_unlock_irqrestore(&card->tx_chain.lock, flags); 899 }
866} 900}
867 901
868/** 902/**
@@ -1053,6 +1087,7 @@ spider_net_poll(struct net_device *netdev, int *budget)
1053 int packets_to_do, packets_done = 0; 1087 int packets_to_do, packets_done = 0;
1054 int no_more_packets = 0; 1088 int no_more_packets = 0;
1055 1089
1090 spider_net_cleanup_tx_ring(card);
1056 packets_to_do = min(*budget, netdev->quota); 1091 packets_to_do = min(*budget, netdev->quota);
1057 1092
1058 while (packets_to_do) { 1093 while (packets_to_do) {
@@ -1243,12 +1278,15 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg)
1243 case SPIDER_NET_PHYINT: 1278 case SPIDER_NET_PHYINT:
1244 case SPIDER_NET_GMAC2INT: 1279 case SPIDER_NET_GMAC2INT:
1245 case SPIDER_NET_GMAC1INT: 1280 case SPIDER_NET_GMAC1INT:
1246 case SPIDER_NET_GIPSINT:
1247 case SPIDER_NET_GFIFOINT: 1281 case SPIDER_NET_GFIFOINT:
1248 case SPIDER_NET_DMACINT: 1282 case SPIDER_NET_DMACINT:
1249 case SPIDER_NET_GSYSINT: 1283 case SPIDER_NET_GSYSINT:
1250 break; */ 1284 break; */
1251 1285
1286 case SPIDER_NET_GIPSINT:
1287 show_error = 0;
1288 break;
1289
1252 case SPIDER_NET_GPWOPCMPINT: 1290 case SPIDER_NET_GPWOPCMPINT:
1253 /* PHY write operation completed */ 1291 /* PHY write operation completed */
1254 show_error = 0; 1292 show_error = 0;
@@ -1307,9 +1345,10 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg)
1307 case SPIDER_NET_GDTDCEINT: 1345 case SPIDER_NET_GDTDCEINT:
1308 /* chain end. If a descriptor should be sent, kick off 1346 /* chain end. If a descriptor should be sent, kick off
1309 * tx dma 1347 * tx dma
1310 if (card->tx_chain.tail == card->tx_chain.head) 1348 if (card->tx_chain.tail != card->tx_chain.head)
1311 spider_net_kick_tx_dma(card); 1349 spider_net_kick_tx_dma(card);
1312 show_error = 0; */ 1350 */
1351 show_error = 0;
1313 break; 1352 break;
1314 1353
1315 /* case SPIDER_NET_G1TMCNTINT: not used. print a message */ 1354 /* case SPIDER_NET_G1TMCNTINT: not used. print a message */
@@ -1354,7 +1393,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg)
1354 if (netif_msg_intr(card)) 1393 if (netif_msg_intr(card))
1355 pr_err("got descriptor chain end interrupt, " 1394 pr_err("got descriptor chain end interrupt, "
1356 "restarting DMAC %c.\n", 1395 "restarting DMAC %c.\n",
1357 'D'+i-SPIDER_NET_GDDDCEINT); 1396 'D'-(i-SPIDER_NET_GDDDCEINT)/3);
1358 spider_net_refill_rx_chain(card); 1397 spider_net_refill_rx_chain(card);
1359 spider_net_enable_rxdmac(card); 1398 spider_net_enable_rxdmac(card);
1360 show_error = 0; 1399 show_error = 0;
@@ -1423,8 +1462,9 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg)
1423 } 1462 }
1424 1463
1425 if ((show_error) && (netif_msg_intr(card))) 1464 if ((show_error) && (netif_msg_intr(card)))
1426 pr_err("Got error interrupt, GHIINT0STS = 0x%08x, " 1465 pr_err("Got error interrupt on %s, GHIINT0STS = 0x%08x, "
1427 "GHIINT1STS = 0x%08x, GHIINT2STS = 0x%08x\n", 1466 "GHIINT1STS = 0x%08x, GHIINT2STS = 0x%08x\n",
1467 card->netdev->name,
1428 status_reg, error_reg1, error_reg2); 1468 status_reg, error_reg1, error_reg2);
1429 1469
1430 /* clear interrupt sources */ 1470 /* clear interrupt sources */
@@ -1460,6 +1500,8 @@ spider_net_interrupt(int irq, void *ptr)
1460 spider_net_rx_irq_off(card); 1500 spider_net_rx_irq_off(card);
1461 netif_rx_schedule(netdev); 1501 netif_rx_schedule(netdev);
1462 } 1502 }
1503 if (status_reg & SPIDER_NET_TXINT)
1504 netif_rx_schedule(netdev);
1463 1505
1464 if (status_reg & SPIDER_NET_ERRINT ) 1506 if (status_reg & SPIDER_NET_ERRINT )
1465 spider_net_handle_error_irq(card, status_reg); 1507 spider_net_handle_error_irq(card, status_reg);
@@ -1599,7 +1641,7 @@ spider_net_enable_card(struct spider_net_card *card)
1599 SPIDER_NET_INT2_MASK_VALUE); 1641 SPIDER_NET_INT2_MASK_VALUE);
1600 1642
1601 spider_net_write_reg(card, SPIDER_NET_GDTDMACCNTR, 1643 spider_net_write_reg(card, SPIDER_NET_GDTDMACCNTR,
1602 SPIDER_NET_GDTDCEIDIS); 1644 SPIDER_NET_GDTBSTA | SPIDER_NET_GDTDCEIDIS);
1603} 1645}
1604 1646
1605/** 1647/**
@@ -1615,17 +1657,26 @@ int
1615spider_net_open(struct net_device *netdev) 1657spider_net_open(struct net_device *netdev)
1616{ 1658{
1617 struct spider_net_card *card = netdev_priv(netdev); 1659 struct spider_net_card *card = netdev_priv(netdev);
1618 int result; 1660 struct spider_net_descr *descr;
1661 int i, result;
1619 1662
1620 result = -ENOMEM; 1663 result = -ENOMEM;
1621 if (spider_net_init_chain(card, &card->tx_chain, card->descr, 1664 if (spider_net_init_chain(card, &card->tx_chain, card->descr,
1622 PCI_DMA_TODEVICE, card->tx_desc)) 1665 card->num_tx_desc))
1623 goto alloc_tx_failed; 1666 goto alloc_tx_failed;
1667
1668 card->low_watermark = NULL;
1669
1670 /* rx_chain is after tx_chain, so offset is descr + tx_count */
1624 if (spider_net_init_chain(card, &card->rx_chain, 1671 if (spider_net_init_chain(card, &card->rx_chain,
1625 card->descr + card->rx_desc, 1672 card->descr + card->num_tx_desc,
1626 PCI_DMA_FROMDEVICE, card->rx_desc)) 1673 card->num_rx_desc))
1627 goto alloc_rx_failed; 1674 goto alloc_rx_failed;
1628 1675
1676 descr = card->rx_chain.head;
1677 for (i=0; i < card->num_rx_desc; i++, descr++)
1678 descr->next_descr_addr = descr->next->bus_addr;
1679
1629 /* allocate rx skbs */ 1680 /* allocate rx skbs */
1630 if (spider_net_alloc_rx_skbs(card)) 1681 if (spider_net_alloc_rx_skbs(card))
1631 goto alloc_skbs_failed; 1682 goto alloc_skbs_failed;
@@ -1878,10 +1929,7 @@ spider_net_stop(struct net_device *netdev)
1878 spider_net_disable_rxdmac(card); 1929 spider_net_disable_rxdmac(card);
1879 1930
1880 /* release chains */ 1931 /* release chains */
1881 if (spin_trylock(&card->tx_chain.lock)) { 1932 spider_net_release_tx_chain(card, 1);
1882 spider_net_release_tx_chain(card, 1);
1883 spin_unlock(&card->tx_chain.lock);
1884 }
1885 1933
1886 spider_net_free_chain(card, &card->tx_chain); 1934 spider_net_free_chain(card, &card->tx_chain);
1887 spider_net_free_chain(card, &card->rx_chain); 1935 spider_net_free_chain(card, &card->rx_chain);
@@ -2012,8 +2060,8 @@ spider_net_setup_netdev(struct spider_net_card *card)
2012 2060
2013 card->options.rx_csum = SPIDER_NET_RX_CSUM_DEFAULT; 2061 card->options.rx_csum = SPIDER_NET_RX_CSUM_DEFAULT;
2014 2062
2015 card->tx_desc = tx_descriptors; 2063 card->num_tx_desc = tx_descriptors;
2016 card->rx_desc = rx_descriptors; 2064 card->num_rx_desc = rx_descriptors;
2017 2065
2018 spider_net_setup_netdev_ops(netdev); 2066 spider_net_setup_netdev_ops(netdev);
2019 2067
@@ -2252,6 +2300,8 @@ static struct pci_driver spider_net_driver = {
2252 */ 2300 */
2253static int __init spider_net_init(void) 2301static int __init spider_net_init(void)
2254{ 2302{
2303 printk(KERN_INFO "Spidernet version %s.\n", VERSION);
2304
2255 if (rx_descriptors < SPIDER_NET_RX_DESCRIPTORS_MIN) { 2305 if (rx_descriptors < SPIDER_NET_RX_DESCRIPTORS_MIN) {
2256 rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_MIN; 2306 rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_MIN;
2257 pr_info("adjusting rx descriptors to %i.\n", rx_descriptors); 2307 pr_info("adjusting rx descriptors to %i.\n", rx_descriptors);
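
The tx path reworked above measures the occupied part of the descriptor ring without taking the chain lock, and flags the descriptor three quarters of the way into the pending queue as a "low watermark" so the chip raises a TX interrupt only after the queue has drained that far. Below is a standalone sketch of just the watermark arithmetic used by spider_net_set_low_watermark(); the ring size and helper name are made up for illustration:

#include <stdio.h>

#define RING_SIZE 64	/* toy tx ring size, not the driver's value */

static int watermark_offset(int pending, int ring_size)
{
	/* If the queue is short, interrupts are not worth the cost. */
	if (pending < ring_size / 4)
		return -1;
	/* Otherwise flag the descriptor 3/4 of the way into the queue. */
	return (pending * 3) / 4;
}

int main(void)
{
	int pending;

	for (pending = 0; pending <= RING_SIZE; pending += 8)
		printf("pending=%2d -> watermark offset=%d\n",
		       pending, watermark_offset(pending, RING_SIZE));
	return 0;
}
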
diff --git a/drivers/net/spider_net.h b/drivers/net/spider_net.h
index a59deda2f95e..b3b46119b424 100644
--- a/drivers/net/spider_net.h
+++ b/drivers/net/spider_net.h
@@ -24,6 +24,8 @@
24#ifndef _SPIDER_NET_H 24#ifndef _SPIDER_NET_H
25#define _SPIDER_NET_H 25#define _SPIDER_NET_H
26 26
27#define VERSION "1.1 A"
28
27#include "sungem_phy.h" 29#include "sungem_phy.h"
28 30
29extern int spider_net_stop(struct net_device *netdev); 31extern int spider_net_stop(struct net_device *netdev);
@@ -47,7 +49,7 @@ extern char spider_net_driver_name[];
47#define SPIDER_NET_TX_DESCRIPTORS_MIN 16 49#define SPIDER_NET_TX_DESCRIPTORS_MIN 16
48#define SPIDER_NET_TX_DESCRIPTORS_MAX 512 50#define SPIDER_NET_TX_DESCRIPTORS_MAX 512
49 51
50#define SPIDER_NET_TX_TIMER 20 52#define SPIDER_NET_TX_TIMER (HZ/5)
51 53
52#define SPIDER_NET_RX_CSUM_DEFAULT 1 54#define SPIDER_NET_RX_CSUM_DEFAULT 1
53 55
@@ -189,7 +191,9 @@ extern char spider_net_driver_name[];
189#define SPIDER_NET_MACMODE_VALUE 0x00000001 191#define SPIDER_NET_MACMODE_VALUE 0x00000001
190#define SPIDER_NET_BURSTLMT_VALUE 0x00000200 /* about 16 us */ 192#define SPIDER_NET_BURSTLMT_VALUE 0x00000200 /* about 16 us */
191 193
192/* 1(0) enable r/tx dma 194/* DMAC control register GDMACCNTR
195 *
196 * 1(0) enable r/tx dma
193 * 0000000 fixed to 0 197 * 0000000 fixed to 0
194 * 198 *
195 * 000000 fixed to 0 199 * 000000 fixed to 0
@@ -198,6 +202,7 @@ extern char spider_net_driver_name[];
198 * 202 *
199 * 000000 fixed to 0 203 * 000000 fixed to 0
200 * 00 burst alignment: 128 bytes 204 * 00 burst alignment: 128 bytes
205 * 11 burst alignment: 1024 bytes
201 * 206 *
202 * 00000 fixed to 0 207 * 00000 fixed to 0
203 * 0 descr writeback size 32 bytes 208 * 0 descr writeback size 32 bytes
@@ -208,10 +213,13 @@ extern char spider_net_driver_name[];
208#define SPIDER_NET_DMA_RX_VALUE 0x80000000 213#define SPIDER_NET_DMA_RX_VALUE 0x80000000
209#define SPIDER_NET_DMA_RX_FEND_VALUE 0x00030003 214#define SPIDER_NET_DMA_RX_FEND_VALUE 0x00030003
210/* to set TX_DMA_EN */ 215/* to set TX_DMA_EN */
211#define SPIDER_NET_TX_DMA_EN 0x80000000 216#define SPIDER_NET_TX_DMA_EN 0x80000000
212#define SPIDER_NET_GDTDCEIDIS 0x00000002 217#define SPIDER_NET_GDTBSTA 0x00000300
213#define SPIDER_NET_DMA_TX_VALUE SPIDER_NET_TX_DMA_EN | \ 218#define SPIDER_NET_GDTDCEIDIS 0x00000002
214 SPIDER_NET_GDTDCEIDIS 219#define SPIDER_NET_DMA_TX_VALUE SPIDER_NET_TX_DMA_EN | \
220 SPIDER_NET_GDTBSTA | \
221 SPIDER_NET_GDTDCEIDIS
222
215#define SPIDER_NET_DMA_TX_FEND_VALUE 0x00030003 223#define SPIDER_NET_DMA_TX_FEND_VALUE 0x00030003
216 224
217/* SPIDER_NET_UA_DESCR_VALUE is OR'ed with the unicast address */ 225/* SPIDER_NET_UA_DESCR_VALUE is OR'ed with the unicast address */
@@ -320,13 +328,10 @@ enum spider_net_int2_status {
320 SPIDER_NET_GRISPDNGINT 328 SPIDER_NET_GRISPDNGINT
321}; 329};
322 330
323#define SPIDER_NET_TXINT ( (1 << SPIDER_NET_GTTEDINT) | \ 331#define SPIDER_NET_TXINT ( (1 << SPIDER_NET_GDTFDCINT) )
324 (1 << SPIDER_NET_GDTDCEINT) | \
325 (1 << SPIDER_NET_GDTFDCINT) )
326 332
327/* we rely on flagged descriptor interrupts*/ 333/* We rely on flagged descriptor interrupts */
328#define SPIDER_NET_RXINT ( (1 << SPIDER_NET_GDAFDCINT) | \ 334#define SPIDER_NET_RXINT ( (1 << SPIDER_NET_GDAFDCINT) )
329 (1 << SPIDER_NET_GRMFLLINT) )
330 335
331#define SPIDER_NET_ERRINT ( 0xffffffff & \ 336#define SPIDER_NET_ERRINT ( 0xffffffff & \
332 (~SPIDER_NET_TXINT) & \ 337 (~SPIDER_NET_TXINT) & \
@@ -349,6 +354,7 @@ enum spider_net_int2_status {
349#define SPIDER_NET_DESCR_FORCE_END 0x50000000 /* used in rx and tx */ 354#define SPIDER_NET_DESCR_FORCE_END 0x50000000 /* used in rx and tx */
350#define SPIDER_NET_DESCR_CARDOWNED 0xA0000000 /* used in rx and tx */ 355#define SPIDER_NET_DESCR_CARDOWNED 0xA0000000 /* used in rx and tx */
351#define SPIDER_NET_DESCR_NOT_IN_USE 0xF0000000 356#define SPIDER_NET_DESCR_NOT_IN_USE 0xF0000000
357#define SPIDER_NET_DESCR_TXDESFLG 0x00800000
352 358
353struct spider_net_descr { 359struct spider_net_descr {
354 /* as defined by the hardware */ 360 /* as defined by the hardware */
@@ -433,6 +439,7 @@ struct spider_net_card {
433 439
434 struct spider_net_descr_chain tx_chain; 440 struct spider_net_descr_chain tx_chain;
435 struct spider_net_descr_chain rx_chain; 441 struct spider_net_descr_chain rx_chain;
442 struct spider_net_descr *low_watermark;
436 443
437 struct net_device_stats netdev_stats; 444 struct net_device_stats netdev_stats;
438 445
@@ -448,8 +455,8 @@ struct spider_net_card {
448 455
449 /* for ethtool */ 456 /* for ethtool */
450 int msg_enable; 457 int msg_enable;
451 int rx_desc; 458 int num_rx_desc;
452 int tx_desc; 459 int num_tx_desc;
453 struct spider_net_extra_stats spider_stats; 460 struct spider_net_extra_stats spider_stats;
454 461
455 struct spider_net_descr descr[0]; 462 struct spider_net_descr descr[0];
diff --git a/drivers/net/spider_net_ethtool.c b/drivers/net/spider_net_ethtool.c
index 589e43658dee..91b995102915 100644
--- a/drivers/net/spider_net_ethtool.c
+++ b/drivers/net/spider_net_ethtool.c
@@ -76,7 +76,7 @@ spider_net_ethtool_get_drvinfo(struct net_device *netdev,
76 /* clear and fill out info */ 76 /* clear and fill out info */
77 memset(drvinfo, 0, sizeof(struct ethtool_drvinfo)); 77 memset(drvinfo, 0, sizeof(struct ethtool_drvinfo));
78 strncpy(drvinfo->driver, spider_net_driver_name, 32); 78 strncpy(drvinfo->driver, spider_net_driver_name, 32);
79 strncpy(drvinfo->version, "0.1", 32); 79 strncpy(drvinfo->version, VERSION, 32);
80 strcpy(drvinfo->fw_version, "no information"); 80 strcpy(drvinfo->fw_version, "no information");
81 strncpy(drvinfo->bus_info, pci_name(card->pdev), 32); 81 strncpy(drvinfo->bus_info, pci_name(card->pdev), 32);
82} 82}
@@ -158,9 +158,9 @@ spider_net_ethtool_get_ringparam(struct net_device *netdev,
158 struct spider_net_card *card = netdev->priv; 158 struct spider_net_card *card = netdev->priv;
159 159
160 ering->tx_max_pending = SPIDER_NET_TX_DESCRIPTORS_MAX; 160 ering->tx_max_pending = SPIDER_NET_TX_DESCRIPTORS_MAX;
161 ering->tx_pending = card->tx_desc; 161 ering->tx_pending = card->num_tx_desc;
162 ering->rx_max_pending = SPIDER_NET_RX_DESCRIPTORS_MAX; 162 ering->rx_max_pending = SPIDER_NET_RX_DESCRIPTORS_MAX;
163 ering->rx_pending = card->rx_desc; 163 ering->rx_pending = card->num_rx_desc;
164} 164}
165 165
166static int spider_net_get_stats_count(struct net_device *netdev) 166static int spider_net_get_stats_count(struct net_device *netdev)
diff --git a/drivers/net/sun3_82586.c b/drivers/net/sun3_82586.c
index d1d1885b0295..a3220a96524f 100644
--- a/drivers/net/sun3_82586.c
+++ b/drivers/net/sun3_82586.c
@@ -330,7 +330,7 @@ out2:
330out1: 330out1:
331 free_netdev(dev); 331 free_netdev(dev);
332out: 332out:
333 iounmap((void *)ioaddr); 333 iounmap((void __iomem *)ioaddr);
334 return ERR_PTR(err); 334 return ERR_PTR(err);
335} 335}
336 336
diff --git a/drivers/net/sun3lance.c b/drivers/net/sun3lance.c
index 91c76544e4dd..b865db363ba0 100644
--- a/drivers/net/sun3lance.c
+++ b/drivers/net/sun3lance.c
@@ -286,7 +286,7 @@ struct net_device * __init sun3lance_probe(int unit)
286 286
287out1: 287out1:
288#ifdef CONFIG_SUN3 288#ifdef CONFIG_SUN3
289 iounmap((void *)dev->base_addr); 289 iounmap((void __iomem *)dev->base_addr);
290#endif 290#endif
291out: 291out:
292 free_netdev(dev); 292 free_netdev(dev);
@@ -326,7 +326,7 @@ static int __init lance_probe( struct net_device *dev)
326 ioaddr_probe[1] = tmp2; 326 ioaddr_probe[1] = tmp2;
327 327
328#ifdef CONFIG_SUN3 328#ifdef CONFIG_SUN3
329 iounmap((void *)ioaddr); 329 iounmap((void __iomem *)ioaddr);
330#endif 330#endif
331 return 0; 331 return 0;
332 } 332 }
@@ -956,7 +956,7 @@ void cleanup_module(void)
956{ 956{
957 unregister_netdev(sun3lance_dev); 957 unregister_netdev(sun3lance_dev);
958#ifdef CONFIG_SUN3 958#ifdef CONFIG_SUN3
959 iounmap((void *)sun3lance_dev->base_addr); 959 iounmap((void __iomem *)sun3lance_dev->base_addr);
960#endif 960#endif
961 free_netdev(sun3lance_dev); 961 free_netdev(sun3lance_dev);
962} 962}
diff --git a/drivers/net/tulip/de2104x.c b/drivers/net/tulip/de2104x.c
index 2cfd9634895a..f6b3a94e97bf 100644
--- a/drivers/net/tulip/de2104x.c
+++ b/drivers/net/tulip/de2104x.c
@@ -1730,7 +1730,7 @@ static void __init de21040_get_media_info(struct de_private *de)
1730} 1730}
1731 1731
1732/* Note: this routine returns extra data bits for size detection. */ 1732/* Note: this routine returns extra data bits for size detection. */
1733static unsigned __init tulip_read_eeprom(void __iomem *regs, int location, int addr_len) 1733static unsigned __devinit tulip_read_eeprom(void __iomem *regs, int location, int addr_len)
1734{ 1734{
1735 int i; 1735 int i;
1736 unsigned retval = 0; 1736 unsigned retval = 0;
@@ -1926,7 +1926,7 @@ bad_srom:
1926 goto fill_defaults; 1926 goto fill_defaults;
1927} 1927}
1928 1928
1929static int __init de_init_one (struct pci_dev *pdev, 1929static int __devinit de_init_one (struct pci_dev *pdev,
1930 const struct pci_device_id *ent) 1930 const struct pci_device_id *ent)
1931{ 1931{
1932 struct net_device *dev; 1932 struct net_device *dev;
@@ -2082,7 +2082,7 @@ err_out_free:
2082 return rc; 2082 return rc;
2083} 2083}
2084 2084
2085static void __exit de_remove_one (struct pci_dev *pdev) 2085static void __devexit de_remove_one (struct pci_dev *pdev)
2086{ 2086{
2087 struct net_device *dev = pci_get_drvdata(pdev); 2087 struct net_device *dev = pci_get_drvdata(pdev);
2088 struct de_private *de = dev->priv; 2088 struct de_private *de = dev->priv;
@@ -2164,7 +2164,7 @@ static struct pci_driver de_driver = {
2164 .name = DRV_NAME, 2164 .name = DRV_NAME,
2165 .id_table = de_pci_tbl, 2165 .id_table = de_pci_tbl,
2166 .probe = de_init_one, 2166 .probe = de_init_one,
2167 .remove = __exit_p(de_remove_one), 2167 .remove = __devexit_p(de_remove_one),
2168#ifdef CONFIG_PM 2168#ifdef CONFIG_PM
2169 .suspend = de_suspend, 2169 .suspend = de_suspend,
2170 .resume = de_resume, 2170 .resume = de_resume,
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 30294127a0aa..ecc50db8585a 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -55,7 +55,7 @@ config PCI_DEBUG
55config HT_IRQ 55config HT_IRQ
56 bool "Interrupts on hypertransport devices" 56 bool "Interrupts on hypertransport devices"
57 default y 57 default y
58 depends on X86_LOCAL_APIC && X86_IO_APIC 58 depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
59 help 59 help
60 This allows native hypertransport devices to use interrupts. 60 This allows native hypertransport devices to use interrupts.
61 61
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index a0d1cee0be77..306f46b85a55 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -238,7 +238,7 @@
238#include <linux/module.h> 238#include <linux/module.h>
239#include <linux/sched.h> 239#include <linux/sched.h>
240#include <asm/irq.h> 240#include <asm/irq.h>
241#include <asm/io.h> 241#include <linux/io.h>
242#include <linux/blkdev.h> 242#include <linux/blkdev.h>
243#include <asm/system.h> 243#include <asm/system.h>
244#include <linux/errno.h> 244#include <linux/errno.h>
diff --git a/drivers/scsi/dtc.c b/drivers/scsi/dtc.c
index 0d5713dfa204..54756722dd5f 100644
--- a/drivers/scsi/dtc.c
+++ b/drivers/scsi/dtc.c
@@ -82,7 +82,7 @@
82#include <linux/string.h> 82#include <linux/string.h>
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/interrupt.h> 84#include <linux/interrupt.h>
85#include <asm/io.h> 85#include <linux/io.h>
86#include "scsi.h" 86#include "scsi.h"
87#include <scsi/scsi_host.h> 87#include <scsi/scsi_host.h>
88#include "dtc.h" 88#include "dtc.h"
diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c
index 41b05fc45380..72794a7b6dcc 100644
--- a/drivers/scsi/fdomain.c
+++ b/drivers/scsi/fdomain.c
@@ -278,9 +278,9 @@
278#include <linux/pci.h> 278#include <linux/pci.h>
279#include <linux/stat.h> 279#include <linux/stat.h>
280#include <linux/delay.h> 280#include <linux/delay.h>
281#include <linux/io.h>
281#include <scsi/scsicam.h> 282#include <scsi/scsicam.h>
282 283
283#include <asm/io.h>
284#include <asm/system.h> 284#include <asm/system.h>
285 285
286#include <scsi/scsi.h> 286#include <scsi/scsi.h>
diff --git a/drivers/scsi/seagate.c b/drivers/scsi/seagate.c
index 8ff1f2866f7b..5ffec2721b28 100644
--- a/drivers/scsi/seagate.c
+++ b/drivers/scsi/seagate.c
@@ -97,8 +97,8 @@
97#include <linux/blkdev.h> 97#include <linux/blkdev.h>
98#include <linux/stat.h> 98#include <linux/stat.h>
99#include <linux/delay.h> 99#include <linux/delay.h>
100#include <linux/io.h>
100 101
101#include <asm/io.h>
102#include <asm/system.h> 102#include <asm/system.h>
103#include <asm/uaccess.h> 103#include <asm/uaccess.h>
104 104
diff --git a/drivers/scsi/t128.c b/drivers/scsi/t128.c
index 2df6747cb76f..0b7a70f61e0d 100644
--- a/drivers/scsi/t128.c
+++ b/drivers/scsi/t128.c
@@ -109,7 +109,7 @@
109#include <asm/system.h> 109#include <asm/system.h>
110#include <linux/signal.h> 110#include <linux/signal.h>
111#include <linux/sched.h> 111#include <linux/sched.h>
112#include <asm/io.h> 112#include <linux/io.h>
113#include <linux/blkdev.h> 113#include <linux/blkdev.h>
114#include <linux/interrupt.h> 114#include <linux/interrupt.h>
115#include <linux/stat.h> 115#include <linux/stat.h>
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index 331e1cf159b0..30be76514c43 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -178,10 +178,10 @@
178#include <linux/blkdev.h> 178#include <linux/blkdev.h>
179#include <linux/init.h> 179#include <linux/init.h>
180#include <linux/stat.h> 180#include <linux/stat.h>
181#include <linux/io.h>
181 182
182#include <asm/system.h> 183#include <asm/system.h>
183#include <asm/dma.h> 184#include <asm/dma.h>
184#include <asm/io.h>
185 185
186#include <scsi/scsi.h> 186#include <scsi/scsi.h>
187#include <scsi/scsi_cmnd.h> 187#include <scsi/scsi_cmnd.h>
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index daaa486159cf..7a43020fa583 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -701,7 +701,6 @@ config FB_NVIDIA
701 depends on FB && PCI 701 depends on FB && PCI
702 select I2C_ALGOBIT if FB_NVIDIA_I2C 702 select I2C_ALGOBIT if FB_NVIDIA_I2C
703 select I2C if FB_NVIDIA_I2C 703 select I2C if FB_NVIDIA_I2C
704 select FB_DDC if FB_NVIDIA_I2C
705 select FB_MODE_HELPERS 704 select FB_MODE_HELPERS
706 select FB_CFB_FILLRECT 705 select FB_CFB_FILLRECT
707 select FB_CFB_COPYAREA 706 select FB_CFB_COPYAREA
diff --git a/drivers/video/nvidia/nv_i2c.c b/drivers/video/nvidia/nv_i2c.c
index e48de3c9fd13..19eef3a09023 100644
--- a/drivers/video/nvidia/nv_i2c.c
+++ b/drivers/video/nvidia/nv_i2c.c
@@ -160,12 +160,51 @@ void nvidia_delete_i2c_busses(struct nvidia_par *par)
160 160
161} 161}
162 162
163static u8 *nvidia_do_probe_i2c_edid(struct nvidia_i2c_chan *chan)
164{
165 u8 start = 0x0;
166 struct i2c_msg msgs[] = {
167 {
168 .addr = 0x50,
169 .len = 1,
170 .buf = &start,
171 }, {
172 .addr = 0x50,
173 .flags = I2C_M_RD,
174 .len = EDID_LENGTH,
175 },
176 };
177 u8 *buf;
178
179 if (!chan->par)
180 return NULL;
181
182 buf = kmalloc(EDID_LENGTH, GFP_KERNEL);
183 if (!buf) {
184 dev_warn(&chan->par->pci_dev->dev, "Out of memory!\n");
185 return NULL;
186 }
187 msgs[1].buf = buf;
188
189 if (i2c_transfer(&chan->adapter, msgs, 2) == 2)
190 return buf;
191 dev_dbg(&chan->par->pci_dev->dev, "Unable to read EDID block.\n");
192 kfree(buf);
193 return NULL;
194}
195
163int nvidia_probe_i2c_connector(struct fb_info *info, int conn, u8 **out_edid) 196int nvidia_probe_i2c_connector(struct fb_info *info, int conn, u8 **out_edid)
164{ 197{
165 struct nvidia_par *par = info->par; 198 struct nvidia_par *par = info->par;
166 u8 *edid; 199 u8 *edid = NULL;
167 200 int i;
168 edid = fb_ddc_read(&par->chan[conn - 1].adapter); 201
202 for (i = 0; i < 3; i++) {
203 /* Do the real work */
204 edid = nvidia_do_probe_i2c_edid(&par->chan[conn - 1]);
205 if (edid)
206 break;
207 }
169 208
170 if (!edid && conn == 1) { 209 if (!edid && conn == 1) {
171 /* try to get from firmware */ 210 /* try to get from firmware */
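
The EDID probe added above is the usual two-message DDC transaction: write a one-byte offset of 0 to I2C address 0x50, then read EDID_LENGTH (128) bytes back. A hedged user-space sketch of the same transaction through /dev/i2c-N and the I2C_RDWR ioctl follows; the bus number is an assumption, and this is not the driver path, which goes through i2c_transfer() in the kernel:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/i2c.h>
#include <linux/i2c-dev.h>

#define EDID_LENGTH 128

int main(void)
{
	unsigned char start = 0x00, edid[EDID_LENGTH];
	struct i2c_msg msgs[2] = {
		{ .addr = 0x50, .flags = 0,        .len = 1,           .buf = &start },
		{ .addr = 0x50, .flags = I2C_M_RD, .len = EDID_LENGTH, .buf = edid   },
	};
	struct i2c_rdwr_ioctl_data xfer = { .msgs = msgs, .nmsgs = 2 };
	int fd = open("/dev/i2c-0", O_RDWR);	/* assumed bus number */

	if (fd < 0 || ioctl(fd, I2C_RDWR, &xfer) < 0) {
		perror("EDID read");
		return 1;
	}
	printf("EDID header bytes: %02x %02x %02x\n", edid[0], edid[1], edid[7]);
	close(fd);
	return 0;
}
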
diff --git a/fs/Kconfig b/fs/Kconfig
index 599de54451af..db4d13324c36 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,6 +140,73 @@ config EXT3_FS_SECURITY
140 If you are not using a security module that requires using 140 If you are not using a security module that requires using
141 extended attributes for file security labels, say N. 141 extended attributes for file security labels, say N.
142 142
143config EXT4DEV_FS
144 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
145 depends on EXPERIMENTAL
146 select JBD2
147 help
148 Ext4dev is a predecessor filesystem of the next generation
149 extended fs ext4, based on ext3 filesystem code. It will be
150 renamed ext4 fs later, once ext4dev is mature and stabilized.
151
152 Unlike the change from ext2 filesystem to ext3 filesystem,
153 the on-disk format of ext4dev is not the same as ext3 any more:
154 it is based on extent maps and it supports 48-bit physical block
155 numbers. These combined on-disk format changes will allow
156 ext4dev/ext4 to handle more than 16 TB filesystem volumes --
157 a hard limit that ext3 cannot overcome without changing the
158 on-disk format.
159
160 Other than extent maps and 48-bit block numbers, ext4dev also is
161 likely to have other new features such as persistent preallocation,
162 high resolution time stamps, and larger file support etc. These
163 features will be added to ext4dev gradually.
164
165 To compile this file system support as a module, choose M here. The
166 module will be called ext4dev. Be aware, however, that the filesystem
167 of your root partition (the one containing the directory /) cannot
168 be compiled as a module, and so this could be dangerous.
169
170 If unsure, say N.
171
172config EXT4DEV_FS_XATTR
173 bool "Ext4dev extended attributes"
174 depends on EXT4DEV_FS
175 default y
176 help
177 Extended attributes are name:value pairs associated with inodes by
178 the kernel or by users (see the attr(5) manual page, or visit
179 <http://acl.bestbits.at/> for details).
180
181 If unsure, say N.
182
183 You need this for POSIX ACL support on ext4dev/ext4.
184
185config EXT4DEV_FS_POSIX_ACL
186 bool "Ext4dev POSIX Access Control Lists"
187 depends on EXT4DEV_FS_XATTR
188 select FS_POSIX_ACL
189 help
190 POSIX Access Control Lists (ACLs) support permissions for users and
191 groups beyond the owner/group/world scheme.
192
193 To learn more about Access Control Lists, visit the POSIX ACLs for
194 Linux website <http://acl.bestbits.at/>.
195
196 If you don't know what Access Control Lists are, say N.
197
198config EXT4DEV_FS_SECURITY
199 bool "Ext4dev Security Labels"
200 depends on EXT4DEV_FS_XATTR
201 help
202 Security labels support alternative access control models
203 implemented by security modules like SELinux. This option
204 enables an extended attribute handler for file security
205 labels in the ext4dev/ext4 filesystem.
206
207 If you are not using a security module that requires using
208 extended attributes for file security labels, say N.
209
143config JBD 210config JBD
144 tristate 211 tristate
145 help 212 help
@@ -172,12 +239,44 @@ config JBD_DEBUG
172 generated. To turn debugging off again, do 239 generated. To turn debugging off again, do
173 "echo 0 > /proc/sys/fs/jbd-debug". 240 "echo 0 > /proc/sys/fs/jbd-debug".
174 241
242config JBD2
243 tristate
244 help
245 This is a generic journaling layer for block devices that support
246 both 32-bit and 64-bit block numbers. It is currently used by
247 the ext4dev/ext4 filesystem, but it could also be used to add
248 journal support to other file systems or block devices such
249 as RAID or LVM.
250
251 If you are using ext4dev/ext4, you need to say Y here. If you are not
252 using ext4dev/ext4 then you will probably want to say N.
253
254 To compile this device as a module, choose M here. The module will be
255 called jbd2. If you are compiling ext4dev/ext4 into the kernel,
256 you cannot compile this code as a module.
257
258config JBD2_DEBUG
259 bool "JBD2 (ext4dev/ext4) debugging support"
260 depends on JBD2
261 help
262 If you are using the ext4dev/ext4 journaled file system (or
263 potentially any other filesystem/device using JBD2), this option
264 allows you to enable debugging output while the system is running,
265 in order to help track down any problems you are having.
266 By default, the debugging output will be turned off.
267
268 If you select Y here, then you will be able to turn on debugging
269 with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
270 1 and 5. The higher the number, the more debugging output is
271 generated. To turn debugging off again, do
272 "echo 0 > /proc/sys/fs/jbd2-debug".
273
175config FS_MBCACHE 274config FS_MBCACHE
176# Meta block cache for Extended Attributes (ext2/ext3) 275# Meta block cache for Extended Attributes (ext2/ext3/ext4)
177 tristate 276 tristate
178 depends on EXT2_FS_XATTR || EXT3_FS_XATTR 277 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
179 default y if EXT2_FS=y || EXT3_FS=y 278 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
180 default m if EXT2_FS=m || EXT3_FS=m 279 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
181 280
182config REISERFS_FS 281config REISERFS_FS
183 tristate "Reiserfs support" 282 tristate "Reiserfs support"
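
The JBD2_DEBUG help text above documents the runtime knob: writing a number between 1 and 5 to /proc/sys/fs/jbd2-debug raises the journalling debug verbosity, and 0 turns it off. A small sketch that sets the level from C instead of the shell; it assumes a kernel built with JBD2_DEBUG and sufficient privileges to write the file:

#include <stdio.h>

int main(void)
{
	/* Equivalent of: echo 3 > /proc/sys/fs/jbd2-debug */
	FILE *f = fopen("/proc/sys/fs/jbd2-debug", "w");

	if (!f) {
		perror("/proc/sys/fs/jbd2-debug");
		return 1;
	}
	fprintf(f, "3\n");
	fclose(f);
	return 0;
}
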
diff --git a/fs/Makefile b/fs/Makefile
index df614eacee86..9a5ce9323bfd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -62,7 +62,9 @@ obj-$(CONFIG_DLM) += dlm/
62# Do not add any filesystems before this line 62# Do not add any filesystems before this line
63obj-$(CONFIG_REISERFS_FS) += reiserfs/ 63obj-$(CONFIG_REISERFS_FS) += reiserfs/
64obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 64obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
65obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
65obj-$(CONFIG_JBD) += jbd/ 66obj-$(CONFIG_JBD) += jbd/
67obj-$(CONFIG_JBD2) += jbd2/
66obj-$(CONFIG_EXT2_FS) += ext2/ 68obj-$(CONFIG_EXT2_FS) += ext2/
67obj-$(CONFIG_CRAMFS) += cramfs/ 69obj-$(CONFIG_CRAMFS) += cramfs/
68obj-$(CONFIG_RAMFS) += ramfs/ 70obj-$(CONFIG_RAMFS) += ramfs/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cf8a2cb28505..a6ec75c56fcf 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -211,8 +211,8 @@ static int afs_dir_open(struct inode *inode, struct file *file)
211{ 211{
212 _enter("{%lu}", inode->i_ino); 212 _enter("{%lu}", inode->i_ino);
213 213
214 BUG_ON(sizeof(union afs_dir_block) != 2048); 214 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
215 BUG_ON(sizeof(union afs_dirent) != 32); 215 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
216 216
217 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) 217 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
218 return -ENOENT; 218 return -ENOENT;
@@ -446,8 +446,8 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
446 _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name); 446 _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
447 447
448 /* insanity checks first */ 448 /* insanity checks first */
449 BUG_ON(sizeof(union afs_dir_block) != 2048); 449 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
450 BUG_ON(sizeof(union afs_dirent) != 32); 450 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
451 451
452 if (dentry->d_name.len > 255) { 452 if (dentry->d_name.len > 255) {
453 _leave(" = -ENAMETOOLONG"); 453 _leave(" = -ENAMETOOLONG");
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 480ab178cba5..b13f32c8aeee 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -94,7 +94,6 @@ struct autofs_wait_queue {
94 94
95struct autofs_sb_info { 95struct autofs_sb_info {
96 u32 magic; 96 u32 magic;
97 struct dentry *root;
98 int pipefd; 97 int pipefd;
99 struct file *pipe; 98 struct file *pipe;
100 pid_t oz_pgrp; 99 pid_t oz_pgrp;
@@ -229,4 +228,4 @@ out:
229} 228}
230 229
231void autofs4_dentry_release(struct dentry *); 230void autofs4_dentry_release(struct dentry *);
232 231extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 5d9193332bef..723a1c5e361b 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
24 .owner = THIS_MODULE, 24 .owner = THIS_MODULE,
25 .name = "autofs", 25 .name = "autofs",
26 .get_sb = autofs_get_sb, 26 .get_sb = autofs_get_sb,
27 .kill_sb = kill_anon_super, 27 .kill_sb = autofs4_kill_sb,
28}; 28};
29 29
30static int __init init_autofs4_fs(void) 30static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 800ce876caec..51fd8595bf85 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,7 +96,7 @@ void autofs4_free_ino(struct autofs_info *ino)
96 */ 96 */
97static void autofs4_force_release(struct autofs_sb_info *sbi) 97static void autofs4_force_release(struct autofs_sb_info *sbi)
98{ 98{
99 struct dentry *this_parent = sbi->root; 99 struct dentry *this_parent = sbi->sb->s_root;
100 struct list_head *next; 100 struct list_head *next;
101 101
102 spin_lock(&dcache_lock); 102 spin_lock(&dcache_lock);
@@ -127,7 +127,7 @@ resume:
127 spin_lock(&dcache_lock); 127 spin_lock(&dcache_lock);
128 } 128 }
129 129
130 if (this_parent != sbi->root) { 130 if (this_parent != sbi->sb->s_root) {
131 struct dentry *dentry = this_parent; 131 struct dentry *dentry = this_parent;
132 132
133 next = this_parent->d_u.d_child.next; 133 next = this_parent->d_u.d_child.next;
@@ -140,15 +140,9 @@ resume:
140 goto resume; 140 goto resume;
141 } 141 }
142 spin_unlock(&dcache_lock); 142 spin_unlock(&dcache_lock);
143
144 dput(sbi->root);
145 sbi->root = NULL;
146 shrink_dcache_sb(sbi->sb);
147
148 return;
149} 143}
150 144
151static void autofs4_put_super(struct super_block *sb) 145void autofs4_kill_sb(struct super_block *sb)
152{ 146{
153 struct autofs_sb_info *sbi = autofs4_sbi(sb); 147 struct autofs_sb_info *sbi = autofs4_sbi(sb);
154 148
@@ -163,6 +157,7 @@ static void autofs4_put_super(struct super_block *sb)
163 kfree(sbi); 157 kfree(sbi);
164 158
165 DPRINTK("shutting down"); 159 DPRINTK("shutting down");
160 kill_anon_super(sb);
166} 161}
167 162
168static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) 163static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
@@ -189,7 +184,6 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
189} 184}
190 185
191static struct super_operations autofs4_sops = { 186static struct super_operations autofs4_sops = {
192 .put_super = autofs4_put_super,
193 .statfs = simple_statfs, 187 .statfs = simple_statfs,
194 .show_options = autofs4_show_options, 188 .show_options = autofs4_show_options,
195}; 189};
@@ -315,7 +309,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
315 309
316 s->s_fs_info = sbi; 310 s->s_fs_info = sbi;
317 sbi->magic = AUTOFS_SBI_MAGIC; 311 sbi->magic = AUTOFS_SBI_MAGIC;
318 sbi->root = NULL;
319 sbi->pipefd = -1; 312 sbi->pipefd = -1;
320 sbi->catatonic = 0; 313 sbi->catatonic = 0;
321 sbi->exp_timeout = 0; 314 sbi->exp_timeout = 0;
@@ -397,13 +390,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
397 sbi->pipefd = pipefd; 390 sbi->pipefd = pipefd;
398 391
399 /* 392 /*
400 * Take a reference to the root dentry so we get a chance to
401 * clean up the dentry tree on umount.
402 * See autofs4_force_release.
403 */
404 sbi->root = dget(root);
405
406 /*
407 * Success! Install the root dentry now to indicate completion. 393 * Success! Install the root dentry now to indicate completion.
408 */ 394 */
409 s->s_root = root; 395 s->s_root = root;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index ce103e7b0bc3..c0a6c8d445c7 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
45 fput(sbi->pipe); /* Close the pipe */ 45 fput(sbi->pipe); /* Close the pipe */
46 sbi->pipe = NULL; 46 sbi->pipe = NULL;
47 } 47 }
48 shrink_dcache_sb(sbi->sb);
49} 48}
50 49
51static int autofs4_write(struct file *file, const void *addr, int bytes) 50static int autofs4_write(struct file *file, const void *addr, int bytes)
diff --git a/fs/bio.c b/fs/bio.c
index 8f93e939f213..f95c8749499f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -79,7 +79,6 @@ static struct bio_set *fs_bio_set;
79static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 79static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
80{ 80{
81 struct bio_vec *bvl; 81 struct bio_vec *bvl;
82 struct biovec_slab *bp;
83 82
84 /* 83 /*
85 * see comment near bvec_array define! 84 * see comment near bvec_array define!
@@ -98,10 +97,12 @@ static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned lon
98 * idx now points to the pool we want to allocate from 97 * idx now points to the pool we want to allocate from
99 */ 98 */
100 99
101 bp = bvec_slabs + *idx;
102 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
103 if (bvl) 101 if (bvl) {
102 struct biovec_slab *bp = bvec_slabs + *idx;
103
104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); 104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
105 }
105 106
106 return bvl; 107 return bvl;
107} 108}
@@ -166,7 +167,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
166 167
167 bio_init(bio); 168 bio_init(bio);
168 if (likely(nr_iovecs)) { 169 if (likely(nr_iovecs)) {
169 unsigned long idx; 170 unsigned long idx = 0; /* shut up gcc */
170 171
171 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 172 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
172 if (unlikely(!bvl)) { 173 if (unlikely(!bvl)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index eeb8ac1aa856..f65ef8821c73 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1042,8 +1042,21 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
1042 } while ((size << sizebits) < PAGE_SIZE); 1042 } while ((size << sizebits) < PAGE_SIZE);
1043 1043
1044 index = block >> sizebits; 1044 index = block >> sizebits;
1045 block = index << sizebits;
1046 1045
1046 /*
1047 * Check for a block which wants to lie outside our maximum possible
1048 * pagecache index. (this comparison is done using sector_t types).
1049 */
1050 if (unlikely(index != block >> sizebits)) {
1051 char b[BDEVNAME_SIZE];
1052
1053 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1054 "device %s\n",
1055 __FUNCTION__, (unsigned long long)block,
1056 bdevname(bdev, b));
1057 return -EIO;
1058 }
1059 block = index << sizebits;
1047 /* Create a page with the proper size buffers.. */ 1060 /* Create a page with the proper size buffers.. */
1048 page = grow_dev_page(bdev, block, index, size); 1061 page = grow_dev_page(bdev, block, index, size);
1049 if (!page) 1062 if (!page)
@@ -1070,12 +1083,16 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1070 1083
1071 for (;;) { 1084 for (;;) {
1072 struct buffer_head * bh; 1085 struct buffer_head * bh;
1086 int ret;
1073 1087
1074 bh = __find_get_block(bdev, block, size); 1088 bh = __find_get_block(bdev, block, size);
1075 if (bh) 1089 if (bh)
1076 return bh; 1090 return bh;
1077 1091
1078 if (!grow_buffers(bdev, block, size)) 1092 ret = grow_buffers(bdev, block, size);
1093 if (ret < 0)
1094 return NULL;
1095 if (ret == 0)
1079 free_more_memory(); 1096 free_more_memory();
1080 } 1097 }
1081} 1098}
@@ -1837,6 +1854,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1837 clear_buffer_new(bh); 1854 clear_buffer_new(bh);
1838 kaddr = kmap_atomic(page, KM_USER0); 1855 kaddr = kmap_atomic(page, KM_USER0);
1839 memset(kaddr+block_start, 0, bh->b_size); 1856 memset(kaddr+block_start, 0, bh->b_size);
1857 flush_dcache_page(page);
1840 kunmap_atomic(kaddr, KM_USER0); 1858 kunmap_atomic(kaddr, KM_USER0);
1841 set_buffer_uptodate(bh); 1859 set_buffer_uptodate(bh);
1842 mark_buffer_dirty(bh); 1860 mark_buffer_dirty(bh);
@@ -2343,6 +2361,7 @@ failed:
2343 */ 2361 */
2344 kaddr = kmap_atomic(page, KM_USER0); 2362 kaddr = kmap_atomic(page, KM_USER0);
2345 memset(kaddr, 0, PAGE_CACHE_SIZE); 2363 memset(kaddr, 0, PAGE_CACHE_SIZE);
2364 flush_dcache_page(page);
2346 kunmap_atomic(kaddr, KM_USER0); 2365 kunmap_atomic(kaddr, KM_USER0);
2347 SetPageUptodate(page); 2366 SetPageUptodate(page);
2348 set_page_dirty(page); 2367 set_page_dirty(page);
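
The grow_buffers() change above rejects a block whose page-cache index cannot be represented: the index is computed in a narrower type than the sector_t block number, so shifting it back and comparing exposes the truncation. A standalone illustration of that check; the explicit 32-bit/64-bit widths are assumptions standing in for pgoff_t and sector_t:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Pretend the page-cache index is 32 bits wide while block
	 * numbers are 64 bits, as on a 32-bit host. */
	uint64_t block = 1ULL << 35;	/* requested block number */
	int sizebits = 3;		/* 8 blocks per page, say */
	uint32_t index = block >> sizebits;

	if ((uint64_t)index != block >> sizebits)
		printf("block %llu lies outside the maximum page-cache index\n",
		       (unsigned long long)block);
	else
		printf("block %llu is addressable, index %u\n",
		       (unsigned long long)block, index);
	return 0;
}
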
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 27ca1aa30562..a91f2628c981 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2438,13 +2438,17 @@ HANDLE_IOCTL(0x1260, broken_blkgetsize)
2438HANDLE_IOCTL(BLKFRAGET, w_long) 2438HANDLE_IOCTL(BLKFRAGET, w_long)
2439HANDLE_IOCTL(BLKSECTGET, w_long) 2439HANDLE_IOCTL(BLKSECTGET, w_long)
2440HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans) 2440HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
2441HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
2442HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans) 2441HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
2443HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
2444HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
2445HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans) 2442HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
2443HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
2444HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
2446HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans) 2445HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
2446HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
2447HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans) 2447HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
2448HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
2449HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
2450HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
2451HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
2448HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans) 2452HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
2449HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans) 2453HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
2450HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans) 2454HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
diff --git a/fs/dcache.c b/fs/dcache.c
index 2355bddad8de..2bac4ba1d1d3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -549,6 +549,136 @@ repeat:
549} 549}
550 550
551/* 551/*
552 * destroy a single subtree of dentries for unmount
553 * - see the comments on shrink_dcache_for_umount() for a description of the
554 * locking
555 */
556static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
557{
558 struct dentry *parent;
559
560 BUG_ON(!IS_ROOT(dentry));
561
562 /* detach this root from the system */
563 spin_lock(&dcache_lock);
564 if (!list_empty(&dentry->d_lru)) {
565 dentry_stat.nr_unused--;
566 list_del_init(&dentry->d_lru);
567 }
568 __d_drop(dentry);
569 spin_unlock(&dcache_lock);
570
571 for (;;) {
572 /* descend to the first leaf in the current subtree */
573 while (!list_empty(&dentry->d_subdirs)) {
574 struct dentry *loop;
575
576 /* this is a branch with children - detach all of them
577 * from the system in one go */
578 spin_lock(&dcache_lock);
579 list_for_each_entry(loop, &dentry->d_subdirs,
580 d_u.d_child) {
581 if (!list_empty(&loop->d_lru)) {
582 dentry_stat.nr_unused--;
583 list_del_init(&loop->d_lru);
584 }
585
586 __d_drop(loop);
587 cond_resched_lock(&dcache_lock);
588 }
589 spin_unlock(&dcache_lock);
590
591 /* move to the first child */
592 dentry = list_entry(dentry->d_subdirs.next,
593 struct dentry, d_u.d_child);
594 }
595
596 /* consume the dentries from this leaf up through its parents
597 * until we find one with children or run out altogether */
598 do {
599 struct inode *inode;
600
601 if (atomic_read(&dentry->d_count) != 0) {
602 printk(KERN_ERR
603 "BUG: Dentry %p{i=%lx,n=%s}"
604 " still in use (%d)"
605 " [unmount of %s %s]\n",
606 dentry,
607 dentry->d_inode ?
608 dentry->d_inode->i_ino : 0UL,
609 dentry->d_name.name,
610 atomic_read(&dentry->d_count),
611 dentry->d_sb->s_type->name,
612 dentry->d_sb->s_id);
613 BUG();
614 }
615
616 parent = dentry->d_parent;
617 if (parent == dentry)
618 parent = NULL;
619 else
620 atomic_dec(&parent->d_count);
621
622 list_del(&dentry->d_u.d_child);
623 dentry_stat.nr_dentry--; /* For d_free, below */
624
625 inode = dentry->d_inode;
626 if (inode) {
627 dentry->d_inode = NULL;
628 list_del_init(&dentry->d_alias);
629 if (dentry->d_op && dentry->d_op->d_iput)
630 dentry->d_op->d_iput(dentry, inode);
631 else
632 iput(inode);
633 }
634
635 d_free(dentry);
636
637 /* finished when we fall off the top of the tree,
638 * otherwise we ascend to the parent and move to the
639 * next sibling if there is one */
640 if (!parent)
641 return;
642
643 dentry = parent;
644
645 } while (list_empty(&dentry->d_subdirs));
646
647 dentry = list_entry(dentry->d_subdirs.next,
648 struct dentry, d_u.d_child);
649 }
650}
651
652/*
653 * destroy the dentries attached to a superblock on unmounting
654 * - we don't need to use dentry->d_lock, and only need dcache_lock when
655 * removing the dentry from the system lists and hashes because:
656 * - the superblock is detached from all mountings and open files, so the
657 * dentry trees will not be rearranged by the VFS
658 * - s_umount is write-locked, so the memory pressure shrinker will ignore
659 * any dentries belonging to this superblock that it comes across
660 * - the filesystem itself is no longer permitted to rearrange the dentries
661 * in this superblock
662 */
663void shrink_dcache_for_umount(struct super_block *sb)
664{
665 struct dentry *dentry;
666
667 if (down_read_trylock(&sb->s_umount))
668 BUG();
669
670 dentry = sb->s_root;
671 sb->s_root = NULL;
672 atomic_dec(&dentry->d_count);
673 shrink_dcache_for_umount_subtree(dentry);
674
675 while (!hlist_empty(&sb->s_anon)) {
676 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
677 shrink_dcache_for_umount_subtree(dentry);
678 }
679}
680
681/*
552 * Search for at least 1 mount point in the dentry's subdirs. 682 * Search for at least 1 mount point in the dentry's subdirs.
553 * We descend to the next level whenever the d_subdirs 683 * We descend to the next level whenever the d_subdirs
554 * list is non-empty and continue searching. 684 * list is non-empty and continue searching.
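The new shrink_dcache_for_umount() above BUG()s if it can take sb->s_umount for reading, i.e. it requires the caller to already hold s_umount for writing. A minimal sketch of the expected call pattern; the caller name is illustrative and not part of this patch:

    /* Illustrative caller only: the real unmount path is expected to hold
     * s_umount for writing before tearing the dentry tree down. */
    static void example_shutdown_super(struct super_block *sb)
    {
            down_write(&sb->s_umount);              /* satisfies the trylock check above */
            if (sb->s_root)
                    shrink_dcache_for_umount(sb);   /* frees s_root and s_anon dentries */
            /* ... ->put_super(), up_write(&sb->s_umount), etc. ... */
    }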
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 557d5b614fae..ae228ec54e94 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -105,6 +105,8 @@
105/* Maximum msec timeout value storeable in a long int */ 105/* Maximum msec timeout value storeable in a long int */
106#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 106#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
107 107
108#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
109
108 110
109struct epoll_filefd { 111struct epoll_filefd {
110 struct file *file; 112 struct file *file;
@@ -497,7 +499,7 @@ void eventpoll_release_file(struct file *file)
497 */ 499 */
498asmlinkage long sys_epoll_create(int size) 500asmlinkage long sys_epoll_create(int size)
499{ 501{
500 int error, fd; 502 int error, fd = -1;
501 struct eventpoll *ep; 503 struct eventpoll *ep;
502 struct inode *inode; 504 struct inode *inode;
503 struct file *file; 505 struct file *file;
@@ -640,7 +642,6 @@ eexit_1:
640 return error; 642 return error;
641} 643}
642 644
643#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
644 645
645/* 646/*
646 * Implement the event wait interface for the eventpoll file. It is the kernel 647 * Implement the event wait interface for the eventpoll file. It is the kernel
@@ -657,7 +658,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
657 current, epfd, events, maxevents, timeout)); 658 current, epfd, events, maxevents, timeout));
658 659
659 /* The maximum number of event must be greater than zero */ 660 /* The maximum number of event must be greater than zero */
660 if (maxevents <= 0 || maxevents > MAX_EVENTS) 661 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
661 return -EINVAL; 662 return -EINVAL;
662 663
663 /* Verify that the area passed by the user is writeable */ 664 /* Verify that the area passed by the user is writeable */
@@ -699,6 +700,55 @@ eexit_1:
699} 700}
700 701
701 702
703#ifdef TIF_RESTORE_SIGMASK
704
705/*
706 * Implement the event wait interface for the eventpoll file. It is the kernel
707 * part of the user space epoll_pwait(2).
708 */
709asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
710 int maxevents, int timeout, const sigset_t __user *sigmask,
711 size_t sigsetsize)
712{
713 int error;
714 sigset_t ksigmask, sigsaved;
715
716 /*
717 * If the caller wants a certain signal mask to be set during the wait,
718 * we apply it here.
719 */
720 if (sigmask) {
721 if (sigsetsize != sizeof(sigset_t))
722 return -EINVAL;
723 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
724 return -EFAULT;
725 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
726 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
727 }
728
729 error = sys_epoll_wait(epfd, events, maxevents, timeout);
730
731 /*
732 * If we changed the signal mask, we need to restore the original one.
733 * In case we've got a signal while waiting, we do not restore the
734 * signal mask yet, and we allow do_signal() to deliver the signal on
735 * the way back to userspace, before the signal mask is restored.
736 */
737 if (sigmask) {
738 if (error == -EINTR) {
739 memcpy(&current->saved_sigmask, &sigsaved,
740 sizeof(sigsaved));
741 set_thread_flag(TIF_RESTORE_SIGMASK);
742 } else
743 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
744 }
745
746 return error;
747}
748
749#endif /* #ifdef TIF_RESTORE_SIGMASK */
750
751
702/* 752/*
703 * Creates the file descriptor to be used by the epoll interface. 753 * Creates the file descriptor to be used by the epoll interface.
704 */ 754 */
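The new sys_epoll_pwait() is the kernel half of epoll_pwait(2): the supplied signal mask is installed atomically for the duration of the wait and restored afterwards, or deferred to signal delivery via TIF_RESTORE_SIGMASK when the wait returns -EINTR. A minimal userspace sketch (the helper name is made up), assuming the usual glibc wrapper for the new syscall:

    #include <signal.h>
    #include <sys/epoll.h>

    /* Wait for events with SIGINT unblocked only while blocked in the kernel,
     * closing the race between sigprocmask() and a plain epoll_wait(). */
    int wait_allowing_sigint(int epfd, struct epoll_event *events, int maxevents)
    {
            sigset_t mask;

            sigfillset(&mask);
            sigdelset(&mask, SIGINT);

            return epoll_pwait(epfd, events, maxevents, -1 /* no timeout */, &mask);
    }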
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 513cd421ac0b..d8b9abd95d07 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -364,7 +364,6 @@ static int parse_options (char * options,
364{ 364{
365 char * p; 365 char * p;
366 substring_t args[MAX_OPT_ARGS]; 366 substring_t args[MAX_OPT_ARGS];
367 unsigned long kind = EXT2_MOUNT_ERRORS_CONT;
368 int option; 367 int option;
369 368
370 if (!options) 369 if (!options)
@@ -404,13 +403,19 @@ static int parse_options (char * options,
404 /* *sb_block = match_int(&args[0]); */ 403 /* *sb_block = match_int(&args[0]); */
405 break; 404 break;
406 case Opt_err_panic: 405 case Opt_err_panic:
407 kind = EXT2_MOUNT_ERRORS_PANIC; 406 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
407 clear_opt (sbi->s_mount_opt, ERRORS_RO);
408 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
408 break; 409 break;
409 case Opt_err_ro: 410 case Opt_err_ro:
410 kind = EXT2_MOUNT_ERRORS_RO; 411 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
412 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
413 set_opt (sbi->s_mount_opt, ERRORS_RO);
411 break; 414 break;
412 case Opt_err_cont: 415 case Opt_err_cont:
413 kind = EXT2_MOUNT_ERRORS_CONT; 416 clear_opt (sbi->s_mount_opt, ERRORS_RO);
417 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
418 set_opt (sbi->s_mount_opt, ERRORS_CONT);
414 break; 419 break;
415 case Opt_nouid32: 420 case Opt_nouid32:
416 set_opt (sbi->s_mount_opt, NO_UID32); 421 set_opt (sbi->s_mount_opt, NO_UID32);
@@ -489,7 +494,6 @@ static int parse_options (char * options,
489 return 0; 494 return 0;
490 } 495 }
491 } 496 }
492 sbi->s_mount_opt |= kind;
493 return 1; 497 return 1;
494} 498}
495 499
@@ -715,6 +719,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
715 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 719 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
716 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO) 720 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
717 set_opt(sbi->s_mount_opt, ERRORS_RO); 721 set_opt(sbi->s_mount_opt, ERRORS_RO);
722 else
723 set_opt(sbi->s_mount_opt, ERRORS_CONT);
718 724
719 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 725 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
720 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 726 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
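With the old 'kind' accumulator gone, parse_options() now clears the two competing flags before setting the requested one, and ext2_fill_super() falls back to ERRORS_CONT when the on-disk default asks for neither panic nor read-only. The net effect is that exactly one EXT2_MOUNT_ERRORS_* bit is set at any time, so error handling can be a simple priority check. An illustrative sketch; the helper itself is not part of the patch:

    /* Illustrative only: after option parsing the three policies are mutually
     * exclusive, so picking the behaviour needs no precedence rules. */
    static int ext2_errors_policy(struct ext2_sb_info *sbi)
    {
            if (sbi->s_mount_opt & EXT2_MOUNT_ERRORS_PANIC)
                    return 2;               /* panic on error */
            if (sbi->s_mount_opt & EXT2_MOUNT_ERRORS_RO)
                    return 1;               /* remount read-only */
            return 0;                       /* errors=continue */
    }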
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8bfd56ef18ca..afc2d4f42d77 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1470,6 +1470,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1470 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1470 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1471 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO) 1471 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
1472 set_opt(sbi->s_mount_opt, ERRORS_RO); 1472 set_opt(sbi->s_mount_opt, ERRORS_RO);
1473 else
1474 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1473 1475
1474 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 1476 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1475 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 1477 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the linux ext4-filesystem routines.
3#
4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
9
10ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
12ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
1/*
2 * linux/fs/ext4/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15#include "acl.h"
16
17/*
18 * Convert from filesystem to in-memory representation.
19 */
20static struct posix_acl *
21ext4_acl_from_disk(const void *value, size_t size)
22{
23 const char *end = (char *)value + size;
24 int n, count;
25 struct posix_acl *acl;
26
27 if (!value)
28 return NULL;
29 if (size < sizeof(ext4_acl_header))
30 return ERR_PTR(-EINVAL);
31 if (((ext4_acl_header *)value)->a_version !=
32 cpu_to_le32(EXT4_ACL_VERSION))
33 return ERR_PTR(-EINVAL);
34 value = (char *)value + sizeof(ext4_acl_header);
35 count = ext4_acl_count(size);
36 if (count < 0)
37 return ERR_PTR(-EINVAL);
38 if (count == 0)
39 return NULL;
40 acl = posix_acl_alloc(count, GFP_KERNEL);
41 if (!acl)
42 return ERR_PTR(-ENOMEM);
43 for (n=0; n < count; n++) {
44 ext4_acl_entry *entry =
45 (ext4_acl_entry *)value;
46 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
47 goto fail;
48 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
49 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
50 switch(acl->a_entries[n].e_tag) {
51 case ACL_USER_OBJ:
52 case ACL_GROUP_OBJ:
53 case ACL_MASK:
54 case ACL_OTHER:
55 value = (char *)value +
56 sizeof(ext4_acl_entry_short);
57 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
58 break;
59
60 case ACL_USER:
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext4_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_id =
66 le32_to_cpu(entry->e_id);
67 break;
68
69 default:
70 goto fail;
71 }
72 }
73 if (value != end)
74 goto fail;
75 return acl;
76
77fail:
78 posix_acl_release(acl);
79 return ERR_PTR(-EINVAL);
80}
81
82/*
83 * Convert from in-memory to filesystem representation.
84 */
85static void *
86ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
87{
88 ext4_acl_header *ext_acl;
89 char *e;
90 size_t n;
91
92 *size = ext4_acl_size(acl->a_count);
93 ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
94 sizeof(ext4_acl_entry), GFP_KERNEL);
95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
98 e = (char *)ext_acl + sizeof(ext4_acl_header);
99 for (n=0; n < acl->a_count; n++) {
100 ext4_acl_entry *entry = (ext4_acl_entry *)e;
101 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
102 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
103 switch(acl->a_entries[n].e_tag) {
104 case ACL_USER:
105 case ACL_GROUP:
106 entry->e_id =
107 cpu_to_le32(acl->a_entries[n].e_id);
108 e += sizeof(ext4_acl_entry);
109 break;
110
111 case ACL_USER_OBJ:
112 case ACL_GROUP_OBJ:
113 case ACL_MASK:
114 case ACL_OTHER:
115 e += sizeof(ext4_acl_entry_short);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 return (char *)ext_acl;
123
124fail:
125 kfree(ext_acl);
126 return ERR_PTR(-EINVAL);
127}
128
129static inline struct posix_acl *
130ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
131{
132 struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
133
134 spin_lock(&inode->i_lock);
135 if (*i_acl != EXT4_ACL_NOT_CACHED)
136 acl = posix_acl_dup(*i_acl);
137 spin_unlock(&inode->i_lock);
138
139 return acl;
140}
141
142static inline void
143ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
144 struct posix_acl *acl)
145{
146 spin_lock(&inode->i_lock);
147 if (*i_acl != EXT4_ACL_NOT_CACHED)
148 posix_acl_release(*i_acl);
149 *i_acl = posix_acl_dup(acl);
150 spin_unlock(&inode->i_lock);
151}
152
153/*
154 * Inode operation get_posix_acl().
155 *
156 * inode->i_mutex: don't care
157 */
158static struct posix_acl *
159ext4_get_acl(struct inode *inode, int type)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 int name_index;
163 char *value = NULL;
164 struct posix_acl *acl;
165 int retval;
166
167 if (!test_opt(inode->i_sb, POSIX_ACL))
168 return NULL;
169
170 switch(type) {
171 case ACL_TYPE_ACCESS:
172 acl = ext4_iget_acl(inode, &ei->i_acl);
173 if (acl != EXT4_ACL_NOT_CACHED)
174 return acl;
175 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
176 break;
177
178 case ACL_TYPE_DEFAULT:
179 acl = ext4_iget_acl(inode, &ei->i_default_acl);
180 if (acl != EXT4_ACL_NOT_CACHED)
181 return acl;
182 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
183 break;
184
185 default:
186 return ERR_PTR(-EINVAL);
187 }
188 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
189 if (retval > 0) {
190 value = kmalloc(retval, GFP_KERNEL);
191 if (!value)
192 return ERR_PTR(-ENOMEM);
193 retval = ext4_xattr_get(inode, name_index, "", value, retval);
194 }
195 if (retval > 0)
196 acl = ext4_acl_from_disk(value, retval);
197 else if (retval == -ENODATA || retval == -ENOSYS)
198 acl = NULL;
199 else
200 acl = ERR_PTR(retval);
201 kfree(value);
202
203 if (!IS_ERR(acl)) {
204 switch(type) {
205 case ACL_TYPE_ACCESS:
206 ext4_iset_acl(inode, &ei->i_acl, acl);
207 break;
208
209 case ACL_TYPE_DEFAULT:
210 ext4_iset_acl(inode, &ei->i_default_acl, acl);
211 break;
212 }
213 }
214 return acl;
215}
216
217/*
218 * Set the access or default ACL of an inode.
219 *
220 * inode->i_mutex: down unless called from ext4_new_inode
221 */
222static int
223ext4_set_acl(handle_t *handle, struct inode *inode, int type,
224 struct posix_acl *acl)
225{
226 struct ext4_inode_info *ei = EXT4_I(inode);
227 int name_index;
228 void *value = NULL;
229 size_t size = 0;
230 int error;
231
232 if (S_ISLNK(inode->i_mode))
233 return -EOPNOTSUPP;
234
235 switch(type) {
236 case ACL_TYPE_ACCESS:
237 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
238 if (acl) {
239 mode_t mode = inode->i_mode;
240 error = posix_acl_equiv_mode(acl, &mode);
241 if (error < 0)
242 return error;
243 else {
244 inode->i_mode = mode;
245 ext4_mark_inode_dirty(handle, inode);
246 if (error == 0)
247 acl = NULL;
248 }
249 }
250 break;
251
252 case ACL_TYPE_DEFAULT:
253 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
254 if (!S_ISDIR(inode->i_mode))
255 return acl ? -EACCES : 0;
256 break;
257
258 default:
259 return -EINVAL;
260 }
261 if (acl) {
262 value = ext4_acl_to_disk(acl, &size);
263 if (IS_ERR(value))
264 return (int)PTR_ERR(value);
265 }
266
267 error = ext4_xattr_set_handle(handle, inode, name_index, "",
268 value, size, 0);
269
270 kfree(value);
271 if (!error) {
272 switch(type) {
273 case ACL_TYPE_ACCESS:
274 ext4_iset_acl(inode, &ei->i_acl, acl);
275 break;
276
277 case ACL_TYPE_DEFAULT:
278 ext4_iset_acl(inode, &ei->i_default_acl, acl);
279 break;
280 }
281 }
282 return error;
283}
284
285static int
286ext4_check_acl(struct inode *inode, int mask)
287{
288 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
289
290 if (IS_ERR(acl))
291 return PTR_ERR(acl);
292 if (acl) {
293 int error = posix_acl_permission(inode, acl, mask);
294 posix_acl_release(acl);
295 return error;
296 }
297
298 return -EAGAIN;
299}
300
301int
302ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
303{
304 return generic_permission(inode, mask, ext4_check_acl);
305}
306
307/*
308 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
309 *
310 * dir->i_mutex: down
311 * inode->i_mutex: up (access to inode is still exclusive)
312 */
313int
314ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
315{
316 struct posix_acl *acl = NULL;
317 int error = 0;
318
319 if (!S_ISLNK(inode->i_mode)) {
320 if (test_opt(dir->i_sb, POSIX_ACL)) {
321 acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
322 if (IS_ERR(acl))
323 return PTR_ERR(acl);
324 }
325 if (!acl)
326 inode->i_mode &= ~current->fs->umask;
327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone;
330 mode_t mode;
331
332 if (S_ISDIR(inode->i_mode)) {
333 error = ext4_set_acl(handle, inode,
334 ACL_TYPE_DEFAULT, acl);
335 if (error)
336 goto cleanup;
337 }
338 clone = posix_acl_clone(acl, GFP_KERNEL);
339 error = -ENOMEM;
340 if (!clone)
341 goto cleanup;
342
343 mode = inode->i_mode;
344 error = posix_acl_create_masq(clone, &mode);
345 if (error >= 0) {
346 inode->i_mode = mode;
347 if (error > 0) {
348 /* This is an extended ACL */
349 error = ext4_set_acl(handle, inode,
350 ACL_TYPE_ACCESS, clone);
351 }
352 }
353 posix_acl_release(clone);
354 }
355cleanup:
356 posix_acl_release(acl);
357 return error;
358}
359
360/*
361 * Does chmod for an inode that may have an Access Control List. The
362 * inode->i_mode field must be updated to the desired value by the caller
363 * before calling this function.
364 * Returns 0 on success, or a negative error number.
365 *
366 * We change the ACL rather than storing some ACL entries in the file
367 * mode permission bits (which would be more efficient), because that
368 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
369 * for directories) are added. There are no more bits available in the
370 * file mode.
371 *
372 * inode->i_mutex: down
373 */
374int
375ext4_acl_chmod(struct inode *inode)
376{
377 struct posix_acl *acl, *clone;
378 int error;
379
380 if (S_ISLNK(inode->i_mode))
381 return -EOPNOTSUPP;
382 if (!test_opt(inode->i_sb, POSIX_ACL))
383 return 0;
384 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
385 if (IS_ERR(acl) || !acl)
386 return PTR_ERR(acl);
387 clone = posix_acl_clone(acl, GFP_KERNEL);
388 posix_acl_release(acl);
389 if (!clone)
390 return -ENOMEM;
391 error = posix_acl_chmod_masq(clone, inode->i_mode);
392 if (!error) {
393 handle_t *handle;
394 int retries = 0;
395
396 retry:
397 handle = ext4_journal_start(inode,
398 EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
399 if (IS_ERR(handle)) {
400 error = PTR_ERR(handle);
401 ext4_std_error(inode->i_sb, error);
402 goto out;
403 }
404 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
405 ext4_journal_stop(handle);
406 if (error == -ENOSPC &&
407 ext4_should_retry_alloc(inode->i_sb, &retries))
408 goto retry;
409 }
410out:
411 posix_acl_release(clone);
412 return error;
413}
414
415/*
416 * Extended attribute handlers
417 */
418static size_t
419ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
420 const char *name, size_t name_len)
421{
422 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
423
424 if (!test_opt(inode->i_sb, POSIX_ACL))
425 return 0;
426 if (list && size <= list_len)
427 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
428 return size;
429}
430
431static size_t
432ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
433 const char *name, size_t name_len)
434{
435 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
436
437 if (!test_opt(inode->i_sb, POSIX_ACL))
438 return 0;
439 if (list && size <= list_len)
440 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
441 return size;
442}
443
444static int
445ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
446{
447 struct posix_acl *acl;
448 int error;
449
450 if (!test_opt(inode->i_sb, POSIX_ACL))
451 return -EOPNOTSUPP;
452
453 acl = ext4_get_acl(inode, type);
454 if (IS_ERR(acl))
455 return PTR_ERR(acl);
456 if (acl == NULL)
457 return -ENODATA;
458 error = posix_acl_to_xattr(acl, buffer, size);
459 posix_acl_release(acl);
460
461 return error;
462}
463
464static int
465ext4_xattr_get_acl_access(struct inode *inode, const char *name,
466 void *buffer, size_t size)
467{
468 if (strcmp(name, "") != 0)
469 return -EINVAL;
470 return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
471}
472
473static int
474ext4_xattr_get_acl_default(struct inode *inode, const char *name,
475 void *buffer, size_t size)
476{
477 if (strcmp(name, "") != 0)
478 return -EINVAL;
479 return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
480}
481
482static int
483ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
484 size_t size)
485{
486 handle_t *handle;
487 struct posix_acl *acl;
488 int error, retries = 0;
489
490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
493 return -EPERM;
494
495 if (value) {
496 acl = posix_acl_from_xattr(value, size);
497 if (IS_ERR(acl))
498 return PTR_ERR(acl);
499 else if (acl) {
500 error = posix_acl_valid(acl);
501 if (error)
502 goto release_and_out;
503 }
504 } else
505 acl = NULL;
506
507retry:
508 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
509 if (IS_ERR(handle))
510 return PTR_ERR(handle);
511 error = ext4_set_acl(handle, inode, type, acl);
512 ext4_journal_stop(handle);
513 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
514 goto retry;
515
516release_and_out:
517 posix_acl_release(acl);
518 return error;
519}
520
521static int
522ext4_xattr_set_acl_access(struct inode *inode, const char *name,
523 const void *value, size_t size, int flags)
524{
525 if (strcmp(name, "") != 0)
526 return -EINVAL;
527 return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
528}
529
530static int
531ext4_xattr_set_acl_default(struct inode *inode, const char *name,
532 const void *value, size_t size, int flags)
533{
534 if (strcmp(name, "") != 0)
535 return -EINVAL;
536 return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
537}
538
539struct xattr_handler ext4_xattr_acl_access_handler = {
540 .prefix = POSIX_ACL_XATTR_ACCESS,
541 .list = ext4_xattr_list_acl_access,
542 .get = ext4_xattr_get_acl_access,
543 .set = ext4_xattr_set_acl_access,
544};
545
546struct xattr_handler ext4_xattr_acl_default_handler = {
547 .prefix = POSIX_ACL_XATTR_DEFAULT,
548 .list = ext4_xattr_list_acl_default,
549 .get = ext4_xattr_get_acl_default,
550 .set = ext4_xattr_set_acl_default,
551};
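The two xattr handlers above expose ACLs under the standard POSIX ACL attribute names (POSIX_ACL_XATTR_ACCESS and POSIX_ACL_XATTR_DEFAULT, i.e. "system.posix_acl_access" and "system.posix_acl_default"). A minimal userspace sketch (helper name is made up) of fetching the raw access-ACL blob via getxattr(2); real applications would normally go through libacl's acl_get_file() instead:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    /* Reads the posix_acl_xattr blob served by ext4_xattr_get_acl_access(). */
    ssize_t read_access_acl(const char *path, void *buf, size_t bufsize)
    {
            ssize_t len = getxattr(path, "system.posix_acl_access", buf, bufsize);

            if (len < 0)
                    perror("getxattr");     /* ENODATA: no ACL beyond the mode bits */
            return len;
    }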
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
1/*
2 File: fs/ext4/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT4_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext4_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext4_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext4_acl_header;
25
26static inline size_t ext4_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext4_acl_header) +
30 count * sizeof(ext4_acl_entry_short);
31 } else {
32 return sizeof(ext4_acl_header) +
33 4 * sizeof(ext4_acl_entry_short) +
34 (count - 4) * sizeof(ext4_acl_entry);
35 }
36}
37
38static inline int ext4_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext4_acl_header);
42 s = size - 4 * sizeof(ext4_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext4_acl_entry_short))
45 return -1;
46 return size / sizeof(ext4_acl_entry_short);
47 } else {
48 if (s % sizeof(ext4_acl_entry))
49 return -1;
50 return s / sizeof(ext4_acl_entry) + 4;
51 }
52}
53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext4_permission (struct inode *, int, struct nameidata *);
62extern int ext4_acl_chmod (struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
66#include <linux/sched.h>
67#define ext4_permission NULL
68
69static inline int
70ext4_acl_chmod(struct inode *inode)
71{
72 return 0;
73}
74
75static inline int
76ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{
78 return 0;
79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
81
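A worked example of the size arithmetic in ext4_acl_size() and ext4_acl_count() above, using the sizes implied by the structure definitions (4-byte header, 4-byte short entry, 8-byte full entry):

    /*
     * ext4_acl_size(3)  = 4 + 3*4        = 16 bytes  (count <= 4: all short entries)
     * ext4_acl_size(6)  = 4 + 4*4 + 2*8  = 36 bytes  (first 4 short, rest full)
     * ext4_acl_count(36): 36 - 4 = 32, 32 - 4*4 = 16, 16 % 8 == 0
     *                     -> 16/8 + 4    = 6 entries (the inverse of the above)
     */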
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
1/*
2 * linux/fs/ext4/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/time.h>
15#include <linux/capability.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * Calculate the block group number and offset, given a block number
29 */
30void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
31 unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
32{
33 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
34 ext4_grpblk_t offset;
35
36 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
37 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
38 if (offsetp)
39 *offsetp = offset;
40 if (blockgrpp)
41 *blockgrpp = blocknr;
42
43}
44
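For concreteness, a worked example of the split performed above, assuming a 1 KiB block size filesystem where s_first_data_block = 1 and EXT4_BLOCKS_PER_GROUP(sb) = 8192:

    /*
     * blocknr = 100000:   100000 - 1 = 99999
     * do_div(99999, 8192) -> quotient 12, remainder 1695
     * => *blockgrpp = 12 (block group), *offsetp = 1695 (offset within group)
     */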
45/*
46 * The free blocks are managed by bitmaps. A file system contains several
47 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
48 * block for inodes, N blocks for the inode table and data blocks.
49 *
50 * The file system contains group descriptors which are located after the
51 * super block. Each descriptor contains the number of the bitmap block and
52 * the free blocks count in the block. The descriptors are loaded in memory
53 * when a file system is mounted (see ext4_read_super).
54 */
55
56
57#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
58
59/**
60 * ext4_get_group_desc() -- load group descriptor from disk
61 * @sb: super block
62 * @block_group: given block group
63 * @bh: pointer to the buffer head to store the block
64 * group descriptor
65 */
66struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
67 unsigned int block_group,
68 struct buffer_head ** bh)
69{
70 unsigned long group_desc;
71 unsigned long offset;
72 struct ext4_group_desc * desc;
73 struct ext4_sb_info *sbi = EXT4_SB(sb);
74
75 if (block_group >= sbi->s_groups_count) {
76 ext4_error (sb, "ext4_get_group_desc",
77 "block_group >= groups_count - "
78 "block_group = %d, groups_count = %lu",
79 block_group, sbi->s_groups_count);
80
81 return NULL;
82 }
83 smp_rmb();
84
85 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
86 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
87 if (!sbi->s_group_desc[group_desc]) {
88 ext4_error (sb, "ext4_get_group_desc",
89 "Group descriptor not loaded - "
90 "block_group = %d, group_desc = %lu, desc = %lu",
91 block_group, group_desc, offset);
92 return NULL;
93 }
94
95 desc = (struct ext4_group_desc *)(
96 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
97 offset * EXT4_DESC_SIZE(sb));
98 if (bh)
99 *bh = sbi->s_group_desc[group_desc];
100 return desc;
101}
102
103/**
104 * read_block_bitmap()
105 * @sb: super block
106 * @block_group: given block group
107 *
108 * Read the bitmap for a given block_group, reading into the specified
109 * slot in the superblock's bitmap cache.
110 *
111 * Return buffer_head on success or NULL in case of failure.
112 */
113static struct buffer_head *
114read_block_bitmap(struct super_block *sb, unsigned int block_group)
115{
116 struct ext4_group_desc * desc;
117 struct buffer_head * bh = NULL;
118
119 desc = ext4_get_group_desc (sb, block_group, NULL);
120 if (!desc)
121 goto error_out;
122 bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
123 if (!bh)
124 ext4_error (sb, "read_block_bitmap",
125 "Cannot read block bitmap - "
126 "block_group = %d, block_bitmap = %llu",
127 block_group,
128 ext4_block_bitmap(sb, desc));
129error_out:
130 return bh;
131}
132/*
133 * The reservation window structure operations
134 * --------------------------------------------
135 * Operations include:
136 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
137 *
138 * We use a red-black tree to represent per-filesystem reservation
139 * windows.
140 *
141 */
142
143/**
144 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
145 * @rb_root: root of per-filesystem reservation rb tree
146 * @verbose: verbose mode
147 * @fn: function which wishes to dump the reservation map
148 *
149 * If verbose is turned on, it will print the whole block reservation
150 * windows(start, end). Otherwise, it will only print out the "bad" windows,
151 * those windows that overlap with their immediate neighbors.
152 */
153#if 1
154static void __rsv_window_dump(struct rb_root *root, int verbose,
155 const char *fn)
156{
157 struct rb_node *n;
158 struct ext4_reserve_window_node *rsv, *prev;
159 int bad;
160
161restart:
162 n = rb_first(root);
163 bad = 0;
164 prev = NULL;
165
166 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
167 while (n) {
168 rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
169 if (verbose)
170 printk("reservation window 0x%p "
171 "start: %llu, end: %llu\n",
172 rsv, rsv->rsv_start, rsv->rsv_end);
173 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
174 printk("Bad reservation %p (start >= end)\n",
175 rsv);
176 bad = 1;
177 }
178 if (prev && prev->rsv_end >= rsv->rsv_start) {
179 printk("Bad reservation %p (prev->end >= start)\n",
180 rsv);
181 bad = 1;
182 }
183 if (bad) {
184 if (!verbose) {
185 printk("Restarting reservation walk in verbose mode\n");
186 verbose = 1;
187 goto restart;
188 }
189 }
190 n = rb_next(n);
191 prev = rsv;
192 }
193 printk("Window map complete.\n");
194 if (bad)
195 BUG();
196}
197#define rsv_window_dump(root, verbose) \
198 __rsv_window_dump((root), (verbose), __FUNCTION__)
199#else
200#define rsv_window_dump(root, verbose) do {} while (0)
201#endif
202
203/**
204 * goal_in_my_reservation()
205 * @rsv: inode's reservation window
206 * @grp_goal: given goal block relative to the allocation block group
207 * @group: the current allocation block group
208 * @sb: filesystem super block
209 *
210 * Test if the given goal block (group relative) is within the file's
211 * own block reservation window range.
212 *
213 * If the reservation window is outside the goal allocation group, return 0;
214 * grp_goal (given goal block) could be -1, which means no specific
215 * goal block. In this case, always return 1.
216 * If the goal block is within the reservation window, return 1;
217 * otherwise, return 0;
218 */
219static int
220goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
221 unsigned int group, struct super_block * sb)
222{
223 ext4_fsblk_t group_first_block, group_last_block;
224
225 group_first_block = ext4_group_first_block_no(sb, group);
226 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
227
228 if ((rsv->_rsv_start > group_last_block) ||
229 (rsv->_rsv_end < group_first_block))
230 return 0;
231 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
232 || (grp_goal + group_first_block > rsv->_rsv_end)))
233 return 0;
234 return 1;
235}
236
237/**
238 * search_reserve_window()
239 * @rb_root: root of reservation tree
240 * @goal: target allocation block
241 *
242 * Find the reserved window which includes the goal, or the previous one
243 * if the goal is not in any window.
244 * Returns NULL if there are no windows or if all windows start after the goal.
245 */
246static struct ext4_reserve_window_node *
247search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
248{
249 struct rb_node *n = root->rb_node;
250 struct ext4_reserve_window_node *rsv;
251
252 if (!n)
253 return NULL;
254
255 do {
256 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
257
258 if (goal < rsv->rsv_start)
259 n = n->rb_left;
260 else if (goal > rsv->rsv_end)
261 n = n->rb_right;
262 else
263 return rsv;
264 } while (n);
265 /*
266 * We've fallen off the end of the tree: the goal wasn't inside
267 * any particular node. OK, the previous node must be to one
268 * side of the interval containing the goal. If it's the RHS,
269 * we need to back up one.
270 */
271 if (rsv->rsv_start > goal) {
272 n = rb_prev(&rsv->rsv_node);
273 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
274 }
275 return rsv;
276}
277
278/**
279 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
280 * @sb: super block
281 * @rsv: reservation window to add
282 *
 283 * Must be called with rsv_lock held.
284 */
285void ext4_rsv_window_add(struct super_block *sb,
286 struct ext4_reserve_window_node *rsv)
287{
288 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
289 struct rb_node *node = &rsv->rsv_node;
290 ext4_fsblk_t start = rsv->rsv_start;
291
292 struct rb_node ** p = &root->rb_node;
293 struct rb_node * parent = NULL;
294 struct ext4_reserve_window_node *this;
295
296 while (*p)
297 {
298 parent = *p;
299 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
300
301 if (start < this->rsv_start)
302 p = &(*p)->rb_left;
303 else if (start > this->rsv_end)
304 p = &(*p)->rb_right;
305 else {
306 rsv_window_dump(root, 1);
307 BUG();
308 }
309 }
310
311 rb_link_node(node, parent, p);
312 rb_insert_color(node, root);
313}
314
315/**
316 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
317 * @sb: super block
318 * @rsv: reservation window to remove
319 *
320 * Mark the block reservation window as not allocated, and unlink it
321 * from the filesystem reservation window rb tree. Must be called with
 322 * rsv_lock held.
323 */
324static void rsv_window_remove(struct super_block *sb,
325 struct ext4_reserve_window_node *rsv)
326{
327 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
328 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
329 rsv->rsv_alloc_hit = 0;
330 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
331}
332
333/*
334 * rsv_is_empty() -- Check if the reservation window is allocated.
335 * @rsv: given reservation window to check
336 *
337 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
338 */
339static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
340{
 341 /* a valid reservation end block cannot be 0 */
342 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
343}
344
345/**
346 * ext4_init_block_alloc_info()
347 * @inode: file inode structure
348 *
349 * Allocate and initialize the reservation window structure, and
350 * link the window to the ext4 inode structure at last
351 *
352 * The reservation window structure is only dynamically allocated
353 * and linked to ext4 inode the first time the open file
354 * needs a new block. So, before every ext4_new_block(s) call, for
355 * regular files, we should check whether the reservation window
 356 * structure exists or not. If it does not, this function is called.
 357 * Failure to do so will result in block reservation being turned off for that
358 * open file.
359 *
360 * This function is called from ext4_get_blocks_handle(), also called
361 * when setting the reservation window size through ioctl before the file
362 * is open for write (needs block allocation).
363 *
 364 * Needs truncate_mutex protection prior to calling this function.
365 */
366void ext4_init_block_alloc_info(struct inode *inode)
367{
368 struct ext4_inode_info *ei = EXT4_I(inode);
369 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
370 struct super_block *sb = inode->i_sb;
371
372 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
373 if (block_i) {
374 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
375
376 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
377 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
378
379 /*
380 * if filesystem is mounted with NORESERVATION, the goal
381 * reservation window size is set to zero to indicate
382 * block reservation is off
383 */
384 if (!test_opt(sb, RESERVATION))
385 rsv->rsv_goal_size = 0;
386 else
387 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
388 rsv->rsv_alloc_hit = 0;
389 block_i->last_alloc_logical_block = 0;
390 block_i->last_alloc_physical_block = 0;
391 }
392 ei->i_block_alloc_info = block_i;
393}
394
395/**
396 * ext4_discard_reservation()
397 * @inode: inode
398 *
399 * Discard(free) block reservation window on last file close, or truncate
400 * or at last iput().
401 *
402 * It is being called in three cases:
403 * ext4_release_file(): last writer close the file
404 * ext4_clear_inode(): last iput(), when nobody link to this file.
405 * ext4_truncate(): when the block indirect map is about to change.
406 *
407 */
408void ext4_discard_reservation(struct inode *inode)
409{
410 struct ext4_inode_info *ei = EXT4_I(inode);
411 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
412 struct ext4_reserve_window_node *rsv;
413 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
414
415 if (!block_i)
416 return;
417
418 rsv = &block_i->rsv_window_node;
419 if (!rsv_is_empty(&rsv->rsv_window)) {
420 spin_lock(rsv_lock);
421 if (!rsv_is_empty(&rsv->rsv_window))
422 rsv_window_remove(inode->i_sb, rsv);
423 spin_unlock(rsv_lock);
424 }
425}
426
427/**
428 * ext4_free_blocks_sb() -- Free given blocks and update quota
429 * @handle: handle to this transaction
430 * @sb: super block
 431 * @block: start physical block to free
432 * @count: number of blocks to free
433 * @pdquot_freed_blocks: pointer to quota
434 */
435void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
436 ext4_fsblk_t block, unsigned long count,
437 unsigned long *pdquot_freed_blocks)
438{
439 struct buffer_head *bitmap_bh = NULL;
440 struct buffer_head *gd_bh;
441 unsigned long block_group;
442 ext4_grpblk_t bit;
443 unsigned long i;
444 unsigned long overflow;
445 struct ext4_group_desc * desc;
446 struct ext4_super_block * es;
447 struct ext4_sb_info *sbi;
448 int err = 0, ret;
449 ext4_grpblk_t group_freed;
450
451 *pdquot_freed_blocks = 0;
452 sbi = EXT4_SB(sb);
453 es = sbi->s_es;
454 if (block < le32_to_cpu(es->s_first_data_block) ||
455 block + count < block ||
456 block + count > ext4_blocks_count(es)) {
457 ext4_error (sb, "ext4_free_blocks",
458 "Freeing blocks not in datazone - "
459 "block = %llu, count = %lu", block, count);
460 goto error_return;
461 }
462
463 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
464
465do_more:
466 overflow = 0;
467 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
468 /*
469 * Check to see if we are freeing blocks across a group
470 * boundary.
471 */
472 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
473 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
474 count -= overflow;
475 }
476 brelse(bitmap_bh);
477 bitmap_bh = read_block_bitmap(sb, block_group);
478 if (!bitmap_bh)
479 goto error_return;
480 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
481 if (!desc)
482 goto error_return;
483
484 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
485 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
486 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
487 in_range(block + count - 1, ext4_inode_table(sb, desc),
488 sbi->s_itb_per_group))
489 ext4_error (sb, "ext4_free_blocks",
490 "Freeing blocks in system zones - "
491 "Block = %llu, count = %lu",
492 block, count);
493
494 /*
495 * We are about to start releasing blocks in the bitmap,
496 * so we need undo access.
497 */
498 /* @@@ check errors */
499 BUFFER_TRACE(bitmap_bh, "getting undo access");
500 err = ext4_journal_get_undo_access(handle, bitmap_bh);
501 if (err)
502 goto error_return;
503
504 /*
505 * We are about to modify some metadata. Call the journal APIs
506 * to unshare ->b_data if a currently-committing transaction is
507 * using it
508 */
509 BUFFER_TRACE(gd_bh, "get_write_access");
510 err = ext4_journal_get_write_access(handle, gd_bh);
511 if (err)
512 goto error_return;
513
514 jbd_lock_bh_state(bitmap_bh);
515
516 for (i = 0, group_freed = 0; i < count; i++) {
517 /*
518 * An HJ special. This is expensive...
519 */
520#ifdef CONFIG_JBD_DEBUG
521 jbd_unlock_bh_state(bitmap_bh);
522 {
523 struct buffer_head *debug_bh;
524 debug_bh = sb_find_get_block(sb, block + i);
525 if (debug_bh) {
526 BUFFER_TRACE(debug_bh, "Deleted!");
527 if (!bh2jh(bitmap_bh)->b_committed_data)
528 BUFFER_TRACE(debug_bh,
529 "No commited data in bitmap");
530 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
531 __brelse(debug_bh);
532 }
533 }
534 jbd_lock_bh_state(bitmap_bh);
535#endif
536 if (need_resched()) {
537 jbd_unlock_bh_state(bitmap_bh);
538 cond_resched();
539 jbd_lock_bh_state(bitmap_bh);
540 }
541 /* @@@ This prevents newly-allocated data from being
542 * freed and then reallocated within the same
543 * transaction.
544 *
545 * Ideally we would want to allow that to happen, but to
546 * do so requires making jbd2_journal_forget() capable of
547 * revoking the queued write of a data block, which
548 * implies blocking on the journal lock. *forget()
549 * cannot block due to truncate races.
550 *
551 * Eventually we can fix this by making jbd2_journal_forget()
552 * return a status indicating whether or not it was able
553 * to revoke the buffer. On successful revoke, it is
554 * safe not to set the allocation bit in the committed
555 * bitmap, because we know that there is no outstanding
556 * activity on the buffer any more and so it is safe to
557 * reallocate it.
558 */
559 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
560 J_ASSERT_BH(bitmap_bh,
561 bh2jh(bitmap_bh)->b_committed_data != NULL);
562 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
563 bh2jh(bitmap_bh)->b_committed_data);
564
565 /*
566 * We clear the bit in the bitmap after setting the committed
567 * data bit, because this is the reverse order to that which
568 * the allocator uses.
569 */
570 BUFFER_TRACE(bitmap_bh, "clear bit");
571 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
572 bit + i, bitmap_bh->b_data)) {
573 jbd_unlock_bh_state(bitmap_bh);
574 ext4_error(sb, __FUNCTION__,
575 "bit already cleared for block %llu",
576 (ext4_fsblk_t)(block + i));
577 jbd_lock_bh_state(bitmap_bh);
578 BUFFER_TRACE(bitmap_bh, "bit already cleared");
579 } else {
580 group_freed++;
581 }
582 }
583 jbd_unlock_bh_state(bitmap_bh);
584
585 spin_lock(sb_bgl_lock(sbi, block_group));
586 desc->bg_free_blocks_count =
587 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
588 group_freed);
589 spin_unlock(sb_bgl_lock(sbi, block_group));
590 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
591
592 /* We dirtied the bitmap block */
593 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
594 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
595
596 /* And the group descriptor block */
597 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
598 ret = ext4_journal_dirty_metadata(handle, gd_bh);
599 if (!err) err = ret;
600 *pdquot_freed_blocks += group_freed;
601
602 if (overflow && !err) {
603 block += count;
604 count = overflow;
605 goto do_more;
606 }
607 sb->s_dirt = 1;
608error_return:
609 brelse(bitmap_bh);
610 ext4_std_error(sb, err);
611 return;
612}
613
614/**
615 * ext4_free_blocks() -- Free given blocks and update quota
616 * @handle: handle for this transaction
617 * @inode: inode
618 * @block: start physical block to free
619 * @count: number of blocks to count
620 */
621void ext4_free_blocks(handle_t *handle, struct inode *inode,
622 ext4_fsblk_t block, unsigned long count)
623{
624 struct super_block * sb;
625 unsigned long dquot_freed_blocks;
626
627 sb = inode->i_sb;
628 if (!sb) {
629 printk ("ext4_free_blocks: nonexistent device");
630 return;
631 }
632 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
633 if (dquot_freed_blocks)
634 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
635 return;
636}
637
638/**
639 * ext4_test_allocatable()
640 * @nr: given allocation block group
641 * @bh: bufferhead contains the bitmap of the given block group
642 *
643 * For ext4 allocations, we must not reuse any blocks which are
644 * allocated in the bitmap buffer's "last committed data" copy. This
645 * prevents deletes from freeing up the page for reuse until we have
646 * committed the delete transaction.
647 *
648 * If we didn't do this, then deleting something and reallocating it as
649 * data would allow the old block to be overwritten before the
650 * transaction committed (because we force data to disk before commit).
651 * This would lead to corruption if we crashed between overwriting the
652 * data and committing the delete.
653 *
654 * @@@ We may want to make this allocation behaviour conditional on
655 * data-writes at some point, and disable it for metadata allocations or
656 * sync-data inodes.
657 */
658static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
659{
660 int ret;
661 struct journal_head *jh = bh2jh(bh);
662
663 if (ext4_test_bit(nr, bh->b_data))
664 return 0;
665
666 jbd_lock_bh_state(bh);
667 if (!jh->b_committed_data)
668 ret = 1;
669 else
670 ret = !ext4_test_bit(nr, jh->b_committed_data);
671 jbd_unlock_bh_state(bh);
672 return ret;
673}
674
675/**
676 * bitmap_search_next_usable_block()
677 * @start: the starting block (group relative) of the search
678 * @bh: bufferhead contains the block group bitmap
679 * @maxblocks: the ending block (group relative) of the reservation
680 *
681 * The bitmap search --- search forward alternately through the actual
682 * bitmap on disk and the last-committed copy in journal, until we find a
683 * bit free in both bitmaps.
684 */
685static ext4_grpblk_t
686bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
687 ext4_grpblk_t maxblocks)
688{
689 ext4_grpblk_t next;
690 struct journal_head *jh = bh2jh(bh);
691
692 while (start < maxblocks) {
693 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
694 if (next >= maxblocks)
695 return -1;
696 if (ext4_test_allocatable(next, bh))
697 return next;
698 jbd_lock_bh_state(bh);
699 if (jh->b_committed_data)
700 start = ext4_find_next_zero_bit(jh->b_committed_data,
701 maxblocks, next);
702 jbd_unlock_bh_state(bh);
703 }
704 return -1;
705}
706
707/**
708 * find_next_usable_block()
709 * @start: the starting block (group relative) to find next
710 * allocatable block in bitmap.
711 * @bh: bufferhead contains the block group bitmap
712 * @maxblocks: the ending block (group relative) for the search
713 *
714 * Find an allocatable block in a bitmap. We honor both the bitmap and
715 * its last-committed copy (if that exists), and perform the "most
716 * appropriate allocation" algorithm of looking for a free block near
717 * the initial goal; then for a free byte somewhere in the bitmap; then
718 * for any free bit in the bitmap.
719 */
720static ext4_grpblk_t
721find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
722 ext4_grpblk_t maxblocks)
723{
724 ext4_grpblk_t here, next;
725 char *p, *r;
726
727 if (start > 0) {
728 /*
729 * The goal was occupied; search forward for a free
730 * block within the next XX blocks.
731 *
732 * end_goal is more or less random, but it has to be
733 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
734 * next 64-bit boundary is simple..
735 */
736 ext4_grpblk_t end_goal = (start + 63) & ~63;
737 if (end_goal > maxblocks)
738 end_goal = maxblocks;
739 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
740 if (here < end_goal && ext4_test_allocatable(here, bh))
741 return here;
742 ext4_debug("Bit not found near goal\n");
743 }
744
745 here = start;
746 if (here < 0)
747 here = 0;
748
749 p = ((char *)bh->b_data) + (here >> 3);
750 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
751 next = (r - ((char *)bh->b_data)) << 3;
752
753 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
754 return next;
755
756 /*
757 * The bitmap search --- search forward alternately through the actual
758 * bitmap and the last-committed copy until we find a bit free in
759 * both
760 */
761 here = bitmap_search_next_usable_block(here, bh, maxblocks);
762 return here;
763}
764
765/**
766 * claim_block()
767 * @block: the free block (group relative) to allocate
 768 * @bh: the bufferhead contains the block group bitmap
769 *
770 * We think we can allocate this block in this bitmap. Try to set the bit.
771 * If that succeeds then check that nobody has allocated and then freed the
 772 * block since we saw that it was not marked in b_committed_data. If it _was_
773 * allocated and freed then clear the bit in the bitmap again and return
774 * zero (failure).
775 */
776static inline int
777claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
778{
779 struct journal_head *jh = bh2jh(bh);
780 int ret;
781
782 if (ext4_set_bit_atomic(lock, block, bh->b_data))
783 return 0;
784 jbd_lock_bh_state(bh);
785 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
786 ext4_clear_bit_atomic(lock, block, bh->b_data);
787 ret = 0;
788 } else {
789 ret = 1;
790 }
791 jbd_unlock_bh_state(bh);
792 return ret;
793}
794
795/**
796 * ext4_try_to_allocate()
797 * @sb: superblock
798 * @handle: handle to this transaction
799 * @group: given allocation block group
800 * @bitmap_bh: bufferhead holds the block bitmap
801 * @grp_goal: given target block within the group
802 * @count: target number of blocks to allocate
803 * @my_rsv: reservation window
804 *
 805 * Attempt to allocate blocks within a given range. Set the range of allocation
806 * first, then find the first free bit(s) from the bitmap (within the range),
807 * and at last, allocate the blocks by claiming the found free bit as allocated.
808 *
809 * To set the range of this allocation:
810 * if there is a reservation window, only try to allocate block(s) from the
811 * file's own reservation window;
 812 * Otherwise, the allocation range starts from the given goal block, ends at
813 * the block group's last block.
814 *
815 * If we failed to allocate the desired block then we may end up crossing to a
816 * new bitmap. In that case we must release write access to the old one via
817 * ext4_journal_release_buffer(), else we'll run out of credits.
818 */
819static ext4_grpblk_t
820ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
821 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
822 unsigned long *count, struct ext4_reserve_window *my_rsv)
823{
824 ext4_fsblk_t group_first_block;
825 ext4_grpblk_t start, end;
826 unsigned long num = 0;
827
828 /* we do allocation within the reservation window if we have a window */
829 if (my_rsv) {
830 group_first_block = ext4_group_first_block_no(sb, group);
831 if (my_rsv->_rsv_start >= group_first_block)
832 start = my_rsv->_rsv_start - group_first_block;
833 else
 834 /* reservation window crosses group boundary */
835 start = 0;
836 end = my_rsv->_rsv_end - group_first_block + 1;
837 if (end > EXT4_BLOCKS_PER_GROUP(sb))
838 /* reservation window crosses group boundary */
839 end = EXT4_BLOCKS_PER_GROUP(sb);
840 if ((start <= grp_goal) && (grp_goal < end))
841 start = grp_goal;
842 else
843 grp_goal = -1;
844 } else {
845 if (grp_goal > 0)
846 start = grp_goal;
847 else
848 start = 0;
849 end = EXT4_BLOCKS_PER_GROUP(sb);
850 }
851
852 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
853
854repeat:
855 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
856 grp_goal = find_next_usable_block(start, bitmap_bh, end);
857 if (grp_goal < 0)
858 goto fail_access;
859 if (!my_rsv) {
860 int i;
861
862 for (i = 0; i < 7 && grp_goal > start &&
863 ext4_test_allocatable(grp_goal - 1,
864 bitmap_bh);
865 i++, grp_goal--)
866 ;
867 }
868 }
869 start = grp_goal;
870
871 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
872 grp_goal, bitmap_bh)) {
873 /*
874 * The block was allocated by another thread, or it was
875 * allocated and then freed by another thread
876 */
877 start++;
878 grp_goal++;
879 if (start >= end)
880 goto fail_access;
881 goto repeat;
882 }
883 num++;
884 grp_goal++;
885 while (num < *count && grp_goal < end
886 && ext4_test_allocatable(grp_goal, bitmap_bh)
887 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
888 grp_goal, bitmap_bh)) {
889 num++;
890 grp_goal++;
891 }
892 *count = num;
893 return grp_goal - num;
894fail_access:
895 *count = num;
896 return -1;
897}
898
899/**
900 * find_next_reservable_window():
901 * find a reservable space within the given range.
902 * It does not allocate the reservation window for now:
903 * alloc_new_reservation() will do the work later.
904 *
905 * @search_head: the head of the searching list;
906 * This is not necessarily the list head of the whole filesystem
907 *
908 * We have both head and start_block to assist the search
909 * for the reservable space. The list starts from head,
910 * but we will shift to the place where start_block is,
911 * then start from there, when looking for a reservable space.
912 *
913 * @size: the target new reservation window size
914 *
915 * @group_first_block: the first block we consider to start
916 * the real search from
917 *
918 * @last_block:
919 * the maximum block number that our goal reservable space
920 * could start from. This is normally the last block in this
921 * group. The search will end when we found the start of next
922 * possible reservable space is out of this boundary.
923 * This could handle the cross boundary reservation window
924 * request.
925 *
 926 * Basically we search the given range (start_block, last_block),
 927 * rather than the whole reservation tree,
 928 * to find a free region that is of the requested size and has not
 929 * been reserved.
930 *
931 */
932static int find_next_reservable_window(
933 struct ext4_reserve_window_node *search_head,
934 struct ext4_reserve_window_node *my_rsv,
935 struct super_block * sb,
936 ext4_fsblk_t start_block,
937 ext4_fsblk_t last_block)
938{
939 struct rb_node *next;
940 struct ext4_reserve_window_node *rsv, *prev;
941 ext4_fsblk_t cur;
942 int size = my_rsv->rsv_goal_size;
943
944 /* TODO: make the start of the reservation window byte-aligned */
945 /* cur = *start_block & ~7;*/
946 cur = start_block;
947 rsv = search_head;
948 if (!rsv)
949 return -1;
950
951 while (1) {
952 if (cur <= rsv->rsv_end)
953 cur = rsv->rsv_end + 1;
954
955		/* TODO?
956		 * If we cannot find a reservable space of the expected
957		 * size, we could, during the re-search, remember the
958		 * largest reservable space we did find and return
959		 * that one.
960		 *
961		 * For now it will fail if we cannot find a reservable
962		 * space of the expected size (or more)...
963		 */
964 if (cur > last_block)
965 return -1; /* fail */
966
967 prev = rsv;
968 next = rb_next(&rsv->rsv_node);
969 rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
970
971 /*
972 * Reached the last reservation, we can just append to the
973 * previous one.
974 */
975 if (!next)
976 break;
977
978 if (cur + size <= rsv->rsv_start) {
979 /*
980			 * Found a reservable space big enough. We could
981			 * have a reservation across the group boundary here.
982 */
983 break;
984 }
985 }
986 /*
987	 * We come here either:
988	 * when we reach the end of the whole list and there is empty
989	 * reservable space after the last entry in the list, in which
990	 * case we append the new window to the end of the list;
991	 *
992	 * or when we find a reservable space in the middle of the list,
993	 * in which case we return the reservation window that we could
994	 * append to, and succeed.
995 */
996
997 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
998 rsv_window_remove(sb, my_rsv);
999
1000 /*
1001	 * Let's book the whole available window for now. We will check the
1002	 * disk bitmap later and then, if there are free blocks, adjust
1003	 * the window size if it's larger than requested.
1004	 * Otherwise, we will remove this node from the tree the next time
1005	 * find_next_reservable_window() is called.
1006 */
1007 my_rsv->rsv_start = cur;
1008 my_rsv->rsv_end = cur + size - 1;
1009 my_rsv->rsv_alloc_hit = 0;
1010
1011 if (prev != my_rsv)
1012 ext4_rsv_window_add(sb, my_rsv);
1013
1014 return 0;
1015}
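The loop above skips past any window that overlaps the candidate start and stops as soon as the gap before the next window is at least the goal size (or the last window has been passed). A minimal user-space sketch of the same gap search, assuming the windows live in a sorted array rather than the rb-tree; find_gap() and struct window are illustrative names, not part of this file:

#include <stdio.h>

struct window { unsigned long start, end; };	/* inclusive, sorted by start */

/* Return the first block >= from where a gap of at least size blocks opens
 * up before the next window (or after the last one); -1UL means failure. */
static unsigned long find_gap(const struct window *w, int n,
			      unsigned long from, unsigned long last,
			      unsigned long size)
{
	unsigned long cur = from;
	int i;

	for (i = 0; i < n; i++) {
		if (cur <= w[i].end)		/* overlaps window i: skip past it */
			cur = w[i].end + 1;
		if (cur > last)
			return (unsigned long)-1;	/* ran past the range */
		if (i + 1 == n || cur + size <= w[i + 1].start)
			return cur;		/* room before the next window */
	}
	return cur <= last ? cur : (unsigned long)-1;	/* no windows at all */
}

int main(void)
{
	struct window w[] = { { 10, 19 }, { 40, 49 } };

	/* A gap of 8 blocks at or after block 5 first opens at block 20. */
	printf("%lu\n", find_gap(w, 2, 5, 100, 8));
	return 0;
}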
1016
1017/**
1018 * alloc_new_reservation()--allocate a new reservation window
1019 *
1020 * To make a new reservation, we search part of the filesystem
1021 * reservation list (the list inside the group). We try to
1022 * allocate a new reservation window near the allocation goal,
1023 * or the beginning of the group, if there is no goal.
1024 *
1025 * We first find a reservable space after the goal, then from
1026 * there, we check the bitmap for the first free block after
1027 * it. If there is no free block until the end of the group, then the
1028 * whole group is full and we fail. Otherwise, check if the free
1029 * block is inside the expected reservable space; if so, we
1030 * succeed.
1031 * If the first free block is outside the reservable space, then
1032 * starting from that first free block, we search for the next
1033 * reservable space, and go on.
1034 *
1035 * On success, a new reservation will be found and inserted into the list.
1036 * It contains at least one free block, and it does not overlap with other
1037 * reservation windows.
1038 *
1039 * On failure, we failed to find a reservation window in this group.
1040 *
1041 * @rsv: the reservation
1042 *
1043 * @grp_goal: The goal (group-relative). It is where the search for a
1044 * free reservable space should start from.
1045 *	If we have a grp_goal (grp_goal >= 0), then start from there;
1046 *	with no grp_goal (grp_goal = -1), we start from the first block
1047 *	of the group.
1048 *
1049 * @sb: the super block
1050 * @group: the group we are trying to allocate in
1051 * @bitmap_bh: the block group block bitmap
1052 *
1053 */
1054static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1055 ext4_grpblk_t grp_goal, struct super_block *sb,
1056 unsigned int group, struct buffer_head *bitmap_bh)
1057{
1058 struct ext4_reserve_window_node *search_head;
1059 ext4_fsblk_t group_first_block, group_end_block, start_block;
1060 ext4_grpblk_t first_free_block;
1061 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1062 unsigned long size;
1063 int ret;
1064 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1065
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1068
1069 if (grp_goal < 0)
1070 start_block = group_first_block;
1071 else
1072 start_block = grp_goal + group_first_block;
1073
1074 size = my_rsv->rsv_goal_size;
1075
1076 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1077 /*
1078	 * If the old reservation crosses the group boundary
1079	 * and the goal is inside the old reservation window,
1080	 * we will come here when we just failed to allocate from
1081	 * the first part of the window. We still have another part
1082	 * that belongs to the next group. In this case, there is no
1083	 * point in discarding our window and trying to allocate a new
1084	 * one in this group (which will fail). We should
1085	 * keep the reservation window and simply move on.
1086 *
1087 * Maybe we could shift the start block of the reservation
1088 * window to the first block of next group.
1089 */
1090
1091 if ((my_rsv->rsv_start <= group_end_block) &&
1092 (my_rsv->rsv_end > group_end_block) &&
1093 (start_block >= my_rsv->rsv_start))
1094 return -1;
1095
1096 if ((my_rsv->rsv_alloc_hit >
1097 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1098 /*
1099			 * if the previous allocation hit ratio is
1100			 * greater than 1/2, then we double the size of
1101			 * the reservation window next time;
1102			 * otherwise we keep the same size window
1103 */
1104 size = size * 2;
1105 if (size > EXT4_MAX_RESERVE_BLOCKS)
1106 size = EXT4_MAX_RESERVE_BLOCKS;
1107			my_rsv->rsv_goal_size = size;
1108 }
1109 }
1110
1111 spin_lock(rsv_lock);
1112 /*
1113 * shift the search start to the window near the goal block
1114 */
1115 search_head = search_reserve_window(fs_rsv_root, start_block);
1116
1117 /*
1118 * find_next_reservable_window() simply finds a reservable window
1119 * inside the given range(start_block, group_end_block).
1120 *
1121 * To make sure the reservation window has a free bit inside it, we
1122 * need to check the bitmap after we found a reservable window.
1123 */
1124retry:
1125 ret = find_next_reservable_window(search_head, my_rsv, sb,
1126 start_block, group_end_block);
1127
1128 if (ret == -1) {
1129 if (!rsv_is_empty(&my_rsv->rsv_window))
1130 rsv_window_remove(sb, my_rsv);
1131 spin_unlock(rsv_lock);
1132 return -1;
1133 }
1134
1135 /*
1136 * On success, find_next_reservable_window() returns the
1137 * reservation window where there is a reservable space after it.
1138 * Before we reserve this reservable space, we need
1139 * to make sure there is at least a free block inside this region.
1140 *
1141	 * Search for the first free bit on the block bitmap and the copy of
1142	 * the last committed bitmap alternately, until we find an allocatable
1143	 * block. The search starts from the start block of the reservable
1144	 * space we just found.
1145 */
1146 spin_unlock(rsv_lock);
1147 first_free_block = bitmap_search_next_usable_block(
1148 my_rsv->rsv_start - group_first_block,
1149 bitmap_bh, group_end_block - group_first_block + 1);
1150
1151 if (first_free_block < 0) {
1152 /*
1153 * no free block left on the bitmap, no point
1154 * to reserve the space. return failed.
1155 */
1156 spin_lock(rsv_lock);
1157 if (!rsv_is_empty(&my_rsv->rsv_window))
1158 rsv_window_remove(sb, my_rsv);
1159 spin_unlock(rsv_lock);
1160 return -1; /* failed */
1161 }
1162
1163 start_block = first_free_block + group_first_block;
1164 /*
1165 * check if the first free block is within the
1166 * free space we just reserved
1167 */
1168 if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
1169 return 0; /* success */
1170 /*
1171 * if the first free bit we found is out of the reservable space
1172 * continue search for next reservable space,
1173 * start from where the free block is,
1174 * we also shift the list head to where we stopped last time
1175 */
1176 search_head = my_rsv;
1177 spin_lock(rsv_lock);
1178 goto retry;
1179}
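One detail worth noting above is the adaptive window size: when more than half of the previous window was actually consumed (rsv_alloc_hit), the goal size doubles, capped at EXT4_MAX_RESERVE_BLOCKS. A hypothetical stand-alone sketch of that heuristic; the function name and the cap constant here are placeholders, not values taken from this file:

/* Illustrative sketch of the window-growth heuristic; the cap is an
 * assumed placeholder, not the real EXT4_MAX_RESERVE_BLOCKS value. */
#define SKETCH_MAX_RESERVE_BLOCKS	1024

static unsigned long next_window_size(unsigned long cur_size,
				      unsigned long window_blocks,
				      unsigned long alloc_hit)
{
	/* More than half of the old window was used: ask for twice as much. */
	if (alloc_hit > window_blocks / 2) {
		cur_size *= 2;
		if (cur_size > SKETCH_MAX_RESERVE_BLOCKS)
			cur_size = SKETCH_MAX_RESERVE_BLOCKS;
	}
	return cur_size;
}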
1180
1181/**
1182 * try_to_extend_reservation()
1183 * @my_rsv: given reservation window
1184 * @sb: super block
1185 * @size: the delta to extend
1186 *
1187 * Attempt to expand the reservation window to be large enough to hold
1188 * the required number of free blocks.
1189 *
1190 * Since ext4_try_to_allocate() will always allocate blocks within
1191 * the reservation window range, if the window size is too small,
1192 * a multiple-block allocation has to stop at the end of the reservation
1193 * window. To make this more efficient, given the total number of
1194 * blocks needed and the current size of the window, we try to
1195 * expand the reservation window size if necessary on a best-effort
1196 * basis before ext4_new_blocks() tries to allocate blocks.
1197 */
1198static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1199 struct super_block *sb, int size)
1200{
1201 struct ext4_reserve_window_node *next_rsv;
1202 struct rb_node *next;
1203 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1204
1205 if (!spin_trylock(rsv_lock))
1206 return;
1207
1208 next = rb_next(&my_rsv->rsv_node);
1209
1210 if (!next)
1211 my_rsv->rsv_end += size;
1212 else {
1213 next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
1214
1215 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1216 my_rsv->rsv_end += size;
1217 else
1218 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1219 }
1220 spin_unlock(rsv_lock);
1221}
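The extension rule above is simple: grow rsv_end by the requested delta when the gap to the next window allows it, otherwise grow only up to the block just before the next window. A small illustrative sketch of the same rule on plain integers (extend_end() and its parameters are hypothetical names):

/* Extend an [start, end] window by want blocks, but never into the
 * neighbouring window that begins at next_start (illustrative only). */
static unsigned long extend_end(unsigned long end, unsigned long next_start,
				int have_next, unsigned long want)
{
	if (!have_next)
		return end + want;		/* nothing to the right */
	if (next_start - end - 1 >= want)
		return end + want;		/* gap is big enough */
	return next_start - 1;			/* clamp to the gap */
}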
1222
1223/**
1224 * ext4_try_to_allocate_with_rsv()
1225 * @sb: superblock
1226 * @handle: handle to this transaction
1227 * @group: given allocation block group
1228 * @bitmap_bh: bufferhead holds the block bitmap
1229 * @grp_goal: given target block within the group
1230 * @count: target number of blocks to allocate
1231 * @my_rsv: reservation window
1232 * @errp: pointer to store the error code
1233 *
1234 * This is the main function used to allocate a new block and its reservation
1235 * window.
1236 *
1237 * Each time a new block allocation is needed, first try to allocate from
1238 * the inode's own reservation. If it does not have a reservation window,
1239 * then instead of looking for a free bit in the bitmap first and then
1240 * checking the reservation list to see whether that bit falls inside
1241 * somebody else's reservation window, we try to allocate a reservation
1242 * window for it starting from the goal first, and then do the block
1243 * allocation within the reservation window.
1244 *
1245 * This avoids repeatedly searching the reservation list when somebody
1246 * is looking for a free block (without a reservation) and there are
1247 * lots of free blocks, but they are all being reserved.
1248 *
1249 * We use a red-black tree for the per-filesystem reservation list.
1250 *
1251 */
1252static ext4_grpblk_t
1253ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1254 unsigned int group, struct buffer_head *bitmap_bh,
1255 ext4_grpblk_t grp_goal,
1256 struct ext4_reserve_window_node * my_rsv,
1257 unsigned long *count, int *errp)
1258{
1259 ext4_fsblk_t group_first_block, group_last_block;
1260 ext4_grpblk_t ret = 0;
1261 int fatal;
1262 unsigned long num = *count;
1263
1264 *errp = 0;
1265
1266 /*
1267 * Make sure we use undo access for the bitmap, because it is critical
1268 * that we do the frozen_data COW on bitmap buffers in all cases even
1269 * if the buffer is in BJ_Forget state in the committing transaction.
1270 */
1271 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1272 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1273 if (fatal) {
1274 *errp = fatal;
1275 return -1;
1276 }
1277
1278 /*
1279	 * We don't deal with reservations when the
1280	 * filesystem is mounted without reservations,
1281	 * or the file is not a regular file,
1282	 * or the last attempt to allocate a block with reservations turned on failed.
1283 */
1284 if (my_rsv == NULL ) {
1285 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1286 grp_goal, count, NULL);
1287 goto out;
1288 }
1289 /*
1290	 * grp_goal is a group-relative block number (if there is a goal):
1291	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb).
1292	 * group_first_block is a filesystem-wide block number; it is
1293	 * the block number of the first block in this group.
1294 */
1295 group_first_block = ext4_group_first_block_no(sb, group);
1296 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1297
1298 /*
1299 * Basically we will allocate a new block from inode's reservation
1300 * window.
1301 *
1302 * We need to allocate a new reservation window, if:
1303 * a) inode does not have a reservation window; or
1304 * b) last attempt to allocate a block from existing reservation
1305 * failed; or
1306	 * c) we come here with a goal but the goal is outside the existing window.
1307 *
1308 * We do not need to allocate a new reservation window if we come here
1309 * at the beginning with a goal and the goal is inside the window, or
1310 * we don't have a goal but already have a reservation window.
1311	 * In those cases we can go on and allocate from the reservation window directly.
1312 */
1313 while (1) {
1314 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1315 !goal_in_my_reservation(&my_rsv->rsv_window,
1316 grp_goal, group, sb)) {
1317 if (my_rsv->rsv_goal_size < *count)
1318 my_rsv->rsv_goal_size = *count;
1319 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1320 group, bitmap_bh);
1321 if (ret < 0)
1322 break; /* failed */
1323
1324 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1325 grp_goal, group, sb))
1326 grp_goal = -1;
1327 } else if (grp_goal > 0 &&
1328 (my_rsv->rsv_end-grp_goal+1) < *count)
1329 try_to_extend_reservation(my_rsv, sb,
1330 *count-my_rsv->rsv_end + grp_goal - 1);
1331
1332 if ((my_rsv->rsv_start > group_last_block) ||
1333 (my_rsv->rsv_end < group_first_block)) {
1334 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1335 BUG();
1336 }
1337 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1338 grp_goal, &num, &my_rsv->rsv_window);
1339 if (ret >= 0) {
1340 my_rsv->rsv_alloc_hit += num;
1341 *count = num;
1342 break; /* succeed */
1343 }
1344 num = *count;
1345 }
1346out:
1347 if (ret >= 0) {
1348 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1349 "bitmap block");
1350 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1351 if (fatal) {
1352 *errp = fatal;
1353 return -1;
1354 }
1355 return ret;
1356 }
1357
1358 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1359 ext4_journal_release_buffer(handle, bitmap_bh);
1360 return ret;
1361}
1362
1363/**
1364 * ext4_has_free_blocks()
1365 * @sbi: in-core super block structure.
1366 *
1367 * Check if filesystem has at least 1 free block available for allocation.
1368 */
1369static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1370{
1371 ext4_fsblk_t free_blocks, root_blocks;
1372
1373 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1374 root_blocks = ext4_r_blocks_count(sbi->s_es);
1375 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1376 sbi->s_resuid != current->fsuid &&
1377 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1378 return 0;
1379 }
1380 return 1;
1381}
1382
1383/**
1384 * ext4_should_retry_alloc()
1385 * @sb: super block
1386 * @retries: number of attempts that have been made
1387 *
1388 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
1389 * it is profitable to retry the operation, this function will wait
1390 * for the current or committing transaction to complete, and then
1391 * return TRUE.
1392 *
1393 * If the total number of retries exceeds three, return FALSE.
1394 */
1395int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1396{
1397 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
1398 return 0;
1399
1400 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1401
1402 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1403}
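Callers typically wrap this in a simple retry loop around an allocating operation: initialise the retry counter once, then loop while the operation keeps returning ENOSPC and ext4_should_retry_alloc() still says a retry is worthwhile. A hedged sketch of that caller pattern; do_some_allocation() is a placeholder, and handle/inode are assumed to exist in the caller:

	int retries = 0;
	int err;

	do {
		err = do_some_allocation(handle, inode);
	} while (err == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));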
1404
1405/**
1406 * ext4_new_blocks() -- core block(s) allocation function
1407 * @handle: handle to this transaction
1408 * @inode: file inode
1409 * @goal: given target block(filesystem wide)
1410 * @count: target number of blocks to allocate
1411 * @errp: error code
1412 *
1413 * ext4_new_blocks uses a goal block to assist allocation. It tries to
1414 * allocate block(s) from the block group that contains the goal block first. If that
1415 * fails, it will try to allocate block(s) from other block groups without
1416 * any specific goal block.
1417 *
1418 */
1419ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1420 ext4_fsblk_t goal, unsigned long *count, int *errp)
1421{
1422 struct buffer_head *bitmap_bh = NULL;
1423 struct buffer_head *gdp_bh;
1424 unsigned long group_no;
1425 int goal_group;
1426 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1427 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1428	ext4_fsblk_t ret_block;		/* filesystem-wide allocated block */
1429 int bgi; /* blockgroup iteration index */
1430 int fatal = 0, err;
1431 int performed_allocation = 0;
1432 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1433 struct super_block *sb;
1434 struct ext4_group_desc *gdp;
1435 struct ext4_super_block *es;
1436 struct ext4_sb_info *sbi;
1437 struct ext4_reserve_window_node *my_rsv = NULL;
1438 struct ext4_block_alloc_info *block_i;
1439 unsigned short windowsz = 0;
1440#ifdef EXT4FS_DEBUG
1441 static int goal_hits, goal_attempts;
1442#endif
1443 unsigned long ngroups;
1444 unsigned long num = *count;
1445
1446 *errp = -ENOSPC;
1447 sb = inode->i_sb;
1448 if (!sb) {
1449 printk("ext4_new_block: nonexistent device");
1450 return 0;
1451 }
1452
1453 /*
1454 * Check quota for allocation of this block.
1455 */
1456 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1457 *errp = -EDQUOT;
1458 return 0;
1459 }
1460
1461 sbi = EXT4_SB(sb);
1462 es = EXT4_SB(sb)->s_es;
1463 ext4_debug("goal=%lu.\n", goal);
1464 /*
1465 * Allocate a block from reservation only when
1466 * filesystem is mounted with reservation(default,-o reservation), and
1467 * it's a regular file, and
1468 * the desired window size is greater than 0 (One could use ioctl
1469 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1470 * reservation on that particular file)
1471 */
1472 block_i = EXT4_I(inode)->i_block_alloc_info;
1473 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1474 my_rsv = &block_i->rsv_window_node;
1475
1476 if (!ext4_has_free_blocks(sbi)) {
1477 *errp = -ENOSPC;
1478 goto out;
1479 }
1480
1481 /*
1482 * First, test whether the goal block is free.
1483 */
1484 if (goal < le32_to_cpu(es->s_first_data_block) ||
1485 goal >= ext4_blocks_count(es))
1486 goal = le32_to_cpu(es->s_first_data_block);
1487 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1488 goal_group = group_no;
1489retry_alloc:
1490 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1491 if (!gdp)
1492 goto io_error;
1493
1494 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1495 /*
1496	 * if there are not enough free blocks to make a new reservation,
1497	 * turn off reservations for this allocation
1498 */
1499 if (my_rsv && (free_blocks < windowsz)
1500 && (rsv_is_empty(&my_rsv->rsv_window)))
1501 my_rsv = NULL;
1502
1503 if (free_blocks > 0) {
1504 bitmap_bh = read_block_bitmap(sb, group_no);
1505 if (!bitmap_bh)
1506 goto io_error;
1507 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1508 group_no, bitmap_bh, grp_target_blk,
1509 my_rsv, &num, &fatal);
1510 if (fatal)
1511 goto out;
1512 if (grp_alloc_blk >= 0)
1513 goto allocated;
1514 }
1515
1516 ngroups = EXT4_SB(sb)->s_groups_count;
1517 smp_rmb();
1518
1519 /*
1520 * Now search the rest of the groups. We assume that
1521	 * group_no and gdp correctly point to the last group visited.
1522 */
1523 for (bgi = 0; bgi < ngroups; bgi++) {
1524 group_no++;
1525 if (group_no >= ngroups)
1526 group_no = 0;
1527 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1528 if (!gdp) {
1529 *errp = -EIO;
1530 goto out;
1531 }
1532 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1533 /*
1534 * skip this group if the number of
1535 * free blocks is less than half of the reservation
1536 * window size.
1537 */
1538 if (free_blocks <= (windowsz/2))
1539 continue;
1540
1541 brelse(bitmap_bh);
1542 bitmap_bh = read_block_bitmap(sb, group_no);
1543 if (!bitmap_bh)
1544 goto io_error;
1545 /*
1546 * try to allocate block(s) from this group, without a goal(-1).
1547 */
1548 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1549 group_no, bitmap_bh, -1, my_rsv,
1550 &num, &fatal);
1551 if (fatal)
1552 goto out;
1553 if (grp_alloc_blk >= 0)
1554 goto allocated;
1555 }
1556 /*
1557	 * We may end up with a bogus earlier ENOSPC error because the
1558	 * filesystem is "full" of reservations, but
1559	 * there may indeed be free blocks available on disk.
1560	 * In this case, we just forget about the reservations
1561	 * and do the block allocation as if without reservations.
1562 */
1563 if (my_rsv) {
1564 my_rsv = NULL;
1565 group_no = goal_group;
1566 goto retry_alloc;
1567 }
1568 /* No space left on the device */
1569 *errp = -ENOSPC;
1570 goto out;
1571
1572allocated:
1573
1574 ext4_debug("using block group %d(%d)\n",
1575 group_no, gdp->bg_free_blocks_count);
1576
1577 BUFFER_TRACE(gdp_bh, "get_write_access");
1578 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1579 if (fatal)
1580 goto out;
1581
1582 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1583
1584 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1585	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1586 in_range(ret_block, ext4_inode_table(sb, gdp),
1587 EXT4_SB(sb)->s_itb_per_group) ||
1588 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1589 EXT4_SB(sb)->s_itb_per_group))
1590 ext4_error(sb, "ext4_new_block",
1591 "Allocating block in system zone - "
1592 "blocks from %llu, length %lu",
1593 ret_block, num);
1594
1595 performed_allocation = 1;
1596
1597#ifdef CONFIG_JBD_DEBUG
1598 {
1599 struct buffer_head *debug_bh;
1600
1601 /* Record bitmap buffer state in the newly allocated block */
1602 debug_bh = sb_find_get_block(sb, ret_block);
1603 if (debug_bh) {
1604 BUFFER_TRACE(debug_bh, "state when allocated");
1605 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1606 brelse(debug_bh);
1607 }
1608 }
1609 jbd_lock_bh_state(bitmap_bh);
1610 spin_lock(sb_bgl_lock(sbi, group_no));
1611 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1612 int i;
1613
1614 for (i = 0; i < num; i++) {
1615 if (ext4_test_bit(grp_alloc_blk+i,
1616 bh2jh(bitmap_bh)->b_committed_data)) {
1617 printk("%s: block was unexpectedly set in "
1618 "b_committed_data\n", __FUNCTION__);
1619 }
1620 }
1621 }
1622 ext4_debug("found bit %d\n", grp_alloc_blk);
1623 spin_unlock(sb_bgl_lock(sbi, group_no));
1624 jbd_unlock_bh_state(bitmap_bh);
1625#endif
1626
1627 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1628 ext4_error(sb, "ext4_new_block",
1629 "block(%llu) >= blocks count(%llu) - "
1630 "block_group = %lu, es == %p ", ret_block,
1631 ext4_blocks_count(es), group_no, es);
1632 goto out;
1633 }
1634
1635 /*
1636 * It is up to the caller to add the new buffer to a journal
1637 * list of some description. We don't know in advance whether
1638 * the caller wants to use it as metadata or data.
1639 */
1640 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1641 ret_block, goal_hits, goal_attempts);
1642
1643 spin_lock(sb_bgl_lock(sbi, group_no));
1644 gdp->bg_free_blocks_count =
1645 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1646 spin_unlock(sb_bgl_lock(sbi, group_no));
1647 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1648
1649 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1650 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1651 if (!fatal)
1652 fatal = err;
1653
1654 sb->s_dirt = 1;
1655 if (fatal)
1656 goto out;
1657
1658 *errp = 0;
1659 brelse(bitmap_bh);
1660 DQUOT_FREE_BLOCK(inode, *count-num);
1661 *count = num;
1662 return ret_block;
1663
1664io_error:
1665 *errp = -EIO;
1666out:
1667 if (fatal) {
1668 *errp = fatal;
1669 ext4_std_error(sb, fatal);
1670 }
1671 /*
1672 * Undo the block allocation
1673 */
1674 if (!performed_allocation)
1675 DQUOT_FREE_BLOCK(inode, *count);
1676 brelse(bitmap_bh);
1677 return 0;
1678}
1679
1680ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1681 ext4_fsblk_t goal, int *errp)
1682{
1683 unsigned long count = 1;
1684
1685 return ext4_new_blocks(handle, inode, goal, &count, errp);
1686}
1687
1688/**
1689 * ext4_count_free_blocks() -- count filesystem free blocks
1690 * @sb: superblock
1691 *
1692 * Adds up the number of free blocks from each block group.
1693 */
1694ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1695{
1696 ext4_fsblk_t desc_count;
1697 struct ext4_group_desc *gdp;
1698 int i;
1699 unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
1700#ifdef EXT4FS_DEBUG
1701 struct ext4_super_block *es;
1702 ext4_fsblk_t bitmap_count;
1703 unsigned long x;
1704 struct buffer_head *bitmap_bh = NULL;
1705
1706 es = EXT4_SB(sb)->s_es;
1707 desc_count = 0;
1708 bitmap_count = 0;
1709 gdp = NULL;
1710
1711 smp_rmb();
1712 for (i = 0; i < ngroups; i++) {
1713 gdp = ext4_get_group_desc(sb, i, NULL);
1714 if (!gdp)
1715 continue;
1716 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1717 brelse(bitmap_bh);
1718 bitmap_bh = read_block_bitmap(sb, i);
1719 if (bitmap_bh == NULL)
1720 continue;
1721
1722 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1723 printk("group %d: stored = %d, counted = %lu\n",
1724 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1725 bitmap_count += x;
1726 }
1727 brelse(bitmap_bh);
1728 printk("ext4_count_free_blocks: stored = %llu"
1729 ", computed = %llu, %llu\n",
1730 EXT4_FREE_BLOCKS_COUNT(es),
1731 desc_count, bitmap_count);
1732 return bitmap_count;
1733#else
1734 desc_count = 0;
1735 smp_rmb();
1736 for (i = 0; i < ngroups; i++) {
1737 gdp = ext4_get_group_desc(sb, i, NULL);
1738 if (!gdp)
1739 continue;
1740 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1741 }
1742
1743 return desc_count;
1744#endif
1745}
1746
1747static inline int
1748block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
1749{
1750 ext4_grpblk_t offset;
1751
1752 ext4_get_group_no_and_offset(sb, block, NULL, &offset);
1753 return ext4_test_bit (offset, map);
1754}
1755
1756static inline int test_root(int a, int b)
1757{
1758 int num = b;
1759
1760 while (a > num)
1761 num *= b;
1762 return num == a;
1763}
1764
1765static int ext4_group_sparse(int group)
1766{
1767 if (group <= 1)
1768 return 1;
1769 if (!(group & 1))
1770 return 0;
1771 return (test_root(group, 7) || test_root(group, 5) ||
1772 test_root(group, 3));
1773}
1774
1775/**
1776 * ext4_bg_has_super - number of blocks used by the superblock in group
1777 * @sb: superblock for filesystem
1778 * @group: group number to check
1779 *
1780 * Return the number of blocks used by the superblock (primary or backup)
1781 * in this group. Currently this will be only 0 or 1.
1782 */
1783int ext4_bg_has_super(struct super_block *sb, int group)
1784{
1785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1786 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1787 !ext4_group_sparse(group))
1788 return 0;
1789 return 1;
1790}
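With the sparse_super feature, only groups 0, 1 and the powers of 3, 5 and 7 carry a superblock backup, which is exactly what test_root()/ext4_group_sparse() above compute. A stand-alone user-space sketch that mirrors that test and prints the backup groups below 100 (purely illustrative, not part of this file):

#include <stdio.h>

static int is_power_of(unsigned long group, unsigned long base)
{
	unsigned long n = base;

	while (group > n)
		n *= base;
	return n == group;
}

static int group_has_backup(unsigned long group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;	/* even groups (other than 0) never do */
	return is_power_of(group, 7) || is_power_of(group, 5) ||
	       is_power_of(group, 3);
}

int main(void)
{
	unsigned long g;

	/* Expected output: 0 1 3 5 7 9 25 27 49 81 */
	for (g = 0; g < 100; g++)
		if (group_has_backup(g))
			printf("%lu ", g);
	printf("\n");
	return 0;
}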
1791
1792static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
1793{
1794 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1795 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1796 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1797
1798 if (group == first || group == first + 1 || group == last)
1799 return 1;
1800 return 0;
1801}
1802
1803static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1804{
1805 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1806 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1807 !ext4_group_sparse(group))
1808 return 0;
1809 return EXT4_SB(sb)->s_gdb_count;
1810}
1811
1812/**
1813 * ext4_bg_num_gdb - number of blocks used by the group table in group
1814 * @sb: superblock for filesystem
1815 * @group: group number to check
1816 *
1817 * Return the number of blocks used by the group descriptor table
1818 * (primary or backup) in this group. In the future there may be a
1819 * different number of descriptor blocks in each group.
1820 */
1821unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
1822{
1823 unsigned long first_meta_bg =
1824 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
1825 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1826
1827 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
1828 metagroup < first_meta_bg)
1829 return ext4_bg_num_gdb_nometa(sb,group);
1830
1831 return ext4_bg_num_gdb_meta(sb,group);
1832
1833}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
1/*
2 * linux/fs/ext4/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include <linux/ext4_fs.h>
13
14#ifdef EXT4FS_DEBUG
15
16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
19{
20 unsigned int i;
21 unsigned long sum = 0;
22
23 if (!map)
24 return (0);
25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum);
29}
30
31#endif /* EXT4FS_DEBUG */
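The nibblemap above counts zero bits four at a time: each table entry holds the number of 0-bits in the corresponding 4-bit value, so each byte costs two table lookups. A stand-alone sketch of the same idea over a plain byte buffer (illustrative names, not part of this file):

#include <stdio.h>

/* Number of zero bits in each possible 4-bit value (0x0..0xf). */
static const int zero_bits_per_nibble[16] = {
	4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0
};

static unsigned long count_free_bits(const unsigned char *map,
				     unsigned int bytes)
{
	unsigned long sum = 0;
	unsigned int i;

	for (i = 0; i < bytes; i++)
		sum += zero_bits_per_nibble[map[i] & 0xf] +
		       zero_bits_per_nibble[(map[i] >> 4) & 0xf];
	return sum;
}

int main(void)
{
	unsigned char bitmap[] = { 0x00, 0xff, 0x0f };	/* 8 + 0 + 4 free bits */

	printf("%lu\n", count_free_bits(bitmap, sizeof(bitmap)));	/* 12 */
	return 0;
}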
32
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
1/*
2 * linux/fs/ext4/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/ext4_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32static unsigned char ext4_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext4_readdir(struct file *, void *, filldir_t);
37static int ext4_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext4_release_dir (struct inode * inode,
40 struct file * filp);
41
42const struct file_operations ext4_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45 .readdir = ext4_readdir, /* we take BKL. needed?*/
46 .ioctl = ext4_ioctl, /* BKL held */
47#ifdef CONFIG_COMPAT
48 .compat_ioctl = ext4_compat_ioctl,
49#endif
50 .fsync = ext4_sync_file, /* BKL held */
51#ifdef CONFIG_EXT4_INDEX
52 .release = ext4_release_dir,
53#endif
54};
55
56
57static unsigned char get_dtype(struct super_block *sb, int filetype)
58{
59 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
60 (filetype >= EXT4_FT_MAX))
61 return DT_UNKNOWN;
62
63 return (ext4_filetype_table[filetype]);
64}
65
66
67int ext4_check_dir_entry (const char * function, struct inode * dir,
68 struct ext4_dir_entry_2 * de,
69 struct buffer_head * bh,
70 unsigned long offset)
71{
72 const char * error_msg = NULL;
73 const int rlen = le16_to_cpu(de->rec_len);
74
75 if (rlen < EXT4_DIR_REC_LEN(1))
76 error_msg = "rec_len is smaller than minimal";
77 else if (rlen % 4 != 0)
78 error_msg = "rec_len % 4 != 0";
79 else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
80 error_msg = "rec_len is too small for name_len";
81 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
82 error_msg = "directory entry across blocks";
83 else if (le32_to_cpu(de->inode) >
84 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
85 error_msg = "inode out of bounds";
86
87 if (error_msg != NULL)
88 ext4_error (dir->i_sb, function,
89 "bad entry in directory #%lu: %s - "
90 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
91 dir->i_ino, error_msg, offset,
92 (unsigned long) le32_to_cpu(de->inode),
93 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0;
95}
96
97static int ext4_readdir(struct file * filp,
98 void * dirent, filldir_t filldir)
99{
100 int error = 0;
101 unsigned long offset;
102 int i, stored;
103 struct ext4_dir_entry_2 *de;
104 struct super_block *sb;
105 int err;
106 struct inode *inode = filp->f_dentry->d_inode;
107 int ret = 0;
108
109 sb = inode->i_sb;
110
111#ifdef CONFIG_EXT4_INDEX
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) {
118 ret = err;
119 goto out;
120 }
121 /*
122 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk.
124 */
125 EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
126 }
127#endif
128 stored = 0;
129 offset = filp->f_pos & (sb->s_blocksize - 1);
130
131 while (!error && !stored && filp->f_pos < inode->i_size) {
132 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
133 struct buffer_head map_bh;
134 struct buffer_head *bh = NULL;
135
136 map_bh.b_state = 0;
137 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
138 if (err > 0) {
139 page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
140 &filp->f_ra,
141 filp,
142 map_bh.b_blocknr >>
143 (PAGE_CACHE_SHIFT - inode->i_blkbits),
144 1);
145 bh = ext4_bread(NULL, inode, blk, 0, &err);
146 }
147
148 /*
149 * We ignore I/O errors on directories so users have a chance
150 * of recovering data when there's a bad sector
151 */
152 if (!bh) {
153 ext4_error (sb, "ext4_readdir",
154 "directory #%lu contains a hole at offset %lu",
155 inode->i_ino, (unsigned long)filp->f_pos);
156 filp->f_pos += sb->s_blocksize - offset;
157 continue;
158 }
159
160revalidate:
161 /* If the dir block has changed since the last call to
162 * readdir(2), then we might be pointing to an invalid
163 * dirent right now. Scan from the start of the block
164 * to make sure. */
165 if (filp->f_version != inode->i_version) {
166 for (i = 0; i < sb->s_blocksize && i < offset; ) {
167 de = (struct ext4_dir_entry_2 *)
168 (bh->b_data + i);
169 /* It's too expensive to do a full
170 * dirent test each time round this
171 * loop, but we do have to test at
172 * least that it is non-zero. A
173 * failure will be detected in the
174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) <
176 EXT4_DIR_REC_LEN(1))
177 break;
178 i += le16_to_cpu(de->rec_len);
179 }
180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
182 | offset;
183 filp->f_version = inode->i_version;
184 }
185
186 while (!error && filp->f_pos < inode->i_size
187 && offset < sb->s_blocksize) {
188 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
189 if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
190 bh, offset)) {
191 /*
192 * On error, skip the f_pos to the next block
193 */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT4_INDEX
232/*
233 * These functions convert from the major/minor hash to an f_pos
234 * value.
235 *
236 * Currently we only use the major hash number. This is unfortunate, but
237 * on 32-bit machines, the same VFS interface is used for lseek and
238 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
239 * lseek/telldir/seekdir will blow out spectacularly, and from within
240 * the ext2 low-level routine, we don't know if we're being called by
241 * a 64-bit version of the system call or the 32-bit version of the
242 * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
243 * cookie. Sigh.
244 */
245#define hash2pos(major, minor) (major >> 1)
246#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
247#define pos2min_hash(pos) (0)
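These macros fold the major hash into an f_pos value and back, throwing the minor hash away; a round trip therefore preserves only the top 31 bits of the major hash, while bit 0 of the major hash and the whole minor hash are lost. A tiny user-space illustration of that round trip (hash_to_pos/pos_to_major are hypothetical names):

#include <stdio.h>

static unsigned long hash_to_pos(unsigned int major)
{
	return major >> 1;
}

static unsigned int pos_to_major(unsigned long pos)
{
	return (pos << 1) & 0xffffffff;
}

int main(void)
{
	unsigned int major = 0x89abcdefu;
	unsigned long pos = hash_to_pos(major);

	/* Only the top 31 bits survive: 0x89abcdef -> 0x89abcdee. */
	printf("pos=%#lx major'=%#x\n", pos, pos_to_major(pos));
	return 0;
}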
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entries in hash order.
252 */
253struct fname {
254 __u32 hash;
255 __u32 minor_hash;
256 struct rb_node rb_hash;
257 struct fname *next;
258 __u32 inode;
259 __u8 name_len;
260 __u8 file_type;
261 char name[0];
262};
263
264/*
265 * This function implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext4_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext4_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kzalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 new_fn->hash = hash;
353 new_fn->minor_hash = minor_hash;
354 new_fn->inode = le32_to_cpu(dirent->inode);
355 new_fn->name_len = dirent->name_len;
356 new_fn->file_type = dirent->file_type;
357 memcpy(new_fn->name, dirent->name, dirent->name_len);
358 new_fn->name[dirent->name_len] = 0;
359
360 while (*p) {
361 parent = *p;
362 fname = rb_entry(parent, struct fname, rb_hash);
363
364 /*
365 * If the hash and minor hash match up, then we put
366 * them on a linked list. This rarely happens...
367 */
368 if ((new_fn->hash == fname->hash) &&
369 (new_fn->minor_hash == fname->minor_hash)) {
370 new_fn->next = fname->next;
371 fname->next = new_fn;
372 return 0;
373 }
374
375 if (new_fn->hash < fname->hash)
376 p = &(*p)->rb_left;
377 else if (new_fn->hash > fname->hash)
378 p = &(*p)->rb_right;
379 else if (new_fn->minor_hash < fname->minor_hash)
380 p = &(*p)->rb_left;
381 else /* if (new_fn->minor_hash > fname->minor_hash) */
382 p = &(*p)->rb_right;
383 }
384
385 rb_link_node(&new_fn->rb_hash, parent, p);
386 rb_insert_color(&new_fn->rb_hash, &info->root);
387 return 0;
388}
389
390
391
392/*
393 * This is a helper function for ext4_dx_readdir. It calls filldir
394 * for all entries on the fname linked list. (Normally there is only
395 * one entry on the linked list, unless there are 62 bit hash collisions.)
396 */
397static int call_filldir(struct file * filp, void * dirent,
398 filldir_t filldir, struct fname *fname)
399{
400 struct dir_private_info *info = filp->private_data;
401 loff_t curr_pos;
402 struct inode *inode = filp->f_dentry->d_inode;
403 struct super_block * sb;
404 int error;
405
406 sb = inode->i_sb;
407
408 if (!fname) {
409 printk("call_filldir: called with null fname?!?\n");
410 return 0;
411 }
412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
413 while (fname) {
414 error = filldir(dirent, fname->name,
415 fname->name_len, curr_pos,
416 fname->inode,
417 get_dtype(sb, fname->file_type));
418 if (error) {
419 filp->f_pos = curr_pos;
420 info->extra_fname = fname->next;
421 return error;
422 }
423 fname = fname->next;
424 }
425 return 0;
426}
427
428static int ext4_dx_readdir(struct file * filp,
429 void * dirent, filldir_t filldir)
430{
431 struct dir_private_info *info = filp->private_data;
432 struct inode *inode = filp->f_dentry->d_inode;
433 struct fname *fname;
434 int ret;
435
436 if (!info) {
437 info = create_dir_info(filp->f_pos);
438 if (!info)
439 return -ENOMEM;
440 filp->private_data = info;
441 }
442
443 if (filp->f_pos == EXT4_HTREE_EOF)
444 return 0; /* EOF */
445
446	/* Someone has messed with f_pos; reset the world */
447 if (info->last_pos != filp->f_pos) {
448 free_rb_tree_fname(&info->root);
449 info->curr_node = NULL;
450 info->extra_fname = NULL;
451 info->curr_hash = pos2maj_hash(filp->f_pos);
452 info->curr_minor_hash = pos2min_hash(filp->f_pos);
453 }
454
455 /*
456 * If there are any leftover names on the hash collision
457 * chain, return them first.
458 */
459 if (info->extra_fname &&
460 call_filldir(filp, dirent, filldir, info->extra_fname))
461 goto finished;
462
463 if (!info->curr_node)
464 info->curr_node = rb_first(&info->root);
465
466 while (1) {
467 /*
468 * Fill the rbtree if we have no more entries,
469 * or the inode has changed since we last read in the
470 * cached entries.
471 */
472 if ((!info->curr_node) ||
473 (filp->f_version != inode->i_version)) {
474 info->curr_node = NULL;
475 free_rb_tree_fname(&info->root);
476 filp->f_version = inode->i_version;
477 ret = ext4_htree_fill_tree(filp, info->curr_hash,
478 info->curr_minor_hash,
479 &info->next_hash);
480 if (ret < 0)
481 return ret;
482 if (ret == 0) {
483 filp->f_pos = EXT4_HTREE_EOF;
484 break;
485 }
486 info->curr_node = rb_first(&info->root);
487 }
488
489 fname = rb_entry(info->curr_node, struct fname, rb_hash);
490 info->curr_hash = fname->hash;
491 info->curr_minor_hash = fname->minor_hash;
492 if (call_filldir(filp, dirent, filldir, fname))
493 break;
494
495 info->curr_node = rb_next(info->curr_node);
496 if (!info->curr_node) {
497 if (info->next_hash == ~0) {
498 filp->f_pos = EXT4_HTREE_EOF;
499 break;
500 }
501 info->curr_hash = info->next_hash;
502 info->curr_minor_hash = 0;
503 }
504 }
505finished:
506 info->last_pos = filp->f_pos;
507 return 0;
508}
509
510static int ext4_release_dir (struct inode * inode, struct file * filp)
511{
512 if (filp->private_data)
513 ext4_htree_free_dir_info(filp->private_data);
514
515 return 0;
516}
517
518#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 * Copyright (c) 2005, Bull S.A.
7 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 * - ext4*_error() should be used in some situations
28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 * - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/ext4_jbd2.h>
36#include <linux/jbd.h>
37#include <linux/smp_lock.h>
38#include <linux/highuid.h>
39#include <linux/pagemap.h>
40#include <linux/quotaops.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/ext4_fs_extents.h>
44#include <asm/uaccess.h>
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53 ext4_fsblk_t block;
54
55 block = le32_to_cpu(ex->ee_start);
56 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57 return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66 ext4_fsblk_t block;
67
68 block = le32_to_cpu(ix->ei_leaf);
69 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70 return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80 ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91 ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
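ext_pblock()/ext4_ext_store_pblock() split a 48-bit physical block number into a 32-bit low word (ee_start) and a 16-bit high word (ee_start_hi); the "<< 31 << 1" dance is just a 32-bit shift written so it stays well defined. A user-space sketch of the same split and recombination, using illustrative names rather than the real on-disk structures:

#include <stdio.h>
#include <stdint.h>

struct fake_extent {		/* stand-in for the on-disk fields */
	uint32_t start_lo;
	uint16_t start_hi;
};

static void store_pblock(struct fake_extent *ex, uint64_t pb)
{
	ex->start_lo = (uint32_t)(pb & 0xffffffff);
	ex->start_hi = (uint16_t)((pb >> 32) & 0xffff);
}

static uint64_t load_pblock(const struct fake_extent *ex)
{
	return (uint64_t)ex->start_lo | ((uint64_t)ex->start_hi << 32);
}

int main(void)
{
	struct fake_extent ex;
	uint64_t pb = 0x123456789aULL;	/* a 48-bit block number */

	store_pblock(&ex, pb);
	printf("%#llx\n", (unsigned long long)load_pblock(&ex));	/* 0x123456789a */
	return 0;
}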
94
95static int ext4_ext_check_header(const char *function, struct inode *inode,
96 struct ext4_extent_header *eh)
97{
98 const char *error_msg = NULL;
99
100 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
101 error_msg = "invalid magic";
102 goto corrupted;
103 }
104 if (unlikely(eh->eh_max == 0)) {
105 error_msg = "invalid eh_max";
106 goto corrupted;
107 }
108 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
109 error_msg = "invalid eh_entries";
110 goto corrupted;
111 }
112 return 0;
113
114corrupted:
115 ext4_error(inode->i_sb, function,
116 "bad header in inode #%lu: %s - magic %x, "
117 "entries %u, max %u, depth %u",
118 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
119 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
120 le16_to_cpu(eh->eh_depth));
121
122 return -EIO;
123}
124
125static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
126{
127 int err;
128
129 if (handle->h_buffer_credits > needed)
130 return handle;
131 if (!ext4_journal_extend(handle, needed))
132 return handle;
133 err = ext4_journal_restart(handle, needed);
134
135 return handle;
136}
137
138/*
139 * could return:
140 * - EROFS
141 * - ENOMEM
142 */
143static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
144 struct ext4_ext_path *path)
145{
146 if (path->p_bh) {
147 /* path points to block */
148 return ext4_journal_get_write_access(handle, path->p_bh);
149 }
150 /* path points to leaf/index in inode body */
151 /* we use in-core data, no need to protect them */
152 return 0;
153}
154
155/*
156 * could return:
157 * - EROFS
158 * - ENOMEM
159 * - EIO
160 */
161static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
162 struct ext4_ext_path *path)
163{
164 int err;
165 if (path->p_bh) {
166 /* path points to block */
167 err = ext4_journal_dirty_metadata(handle, path->p_bh);
168 } else {
169 /* path points to leaf/index in inode body */
170 err = ext4_mark_inode_dirty(handle, inode);
171 }
172 return err;
173}
174
175static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
176 struct ext4_ext_path *path,
177 ext4_fsblk_t block)
178{
179 struct ext4_inode_info *ei = EXT4_I(inode);
180 ext4_fsblk_t bg_start;
181 ext4_grpblk_t colour;
182 int depth;
183
184 if (path) {
185 struct ext4_extent *ex;
186 depth = path->p_depth;
187
188 /* try to predict block placement */
189 if ((ex = path[depth].p_ext))
190 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
191
192 /* it looks like index is empty;
193 * try to find starting block from index itself */
194 if (path[depth].p_bh)
195 return path[depth].p_bh->b_blocknr;
196 }
197
198 /* OK. use inode's group */
199 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
200 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
201 colour = (current->pid % 16) *
202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
203 return bg_start + colour + block;
204}
205
206static ext4_fsblk_t
207ext4_ext_new_block(handle_t *handle, struct inode *inode,
208 struct ext4_ext_path *path,
209 struct ext4_extent *ex, int *err)
210{
211 ext4_fsblk_t goal, newblock;
212
213 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
214 newblock = ext4_new_block(handle, inode, goal, err);
215 return newblock;
216}
217
218static inline int ext4_ext_space_block(struct inode *inode)
219{
220 int size;
221
222 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
223 / sizeof(struct ext4_extent);
224#ifdef AGRESSIVE_TEST
225 if (size > 6)
226 size = 6;
227#endif
228 return size;
229}
230
231static inline int ext4_ext_space_block_idx(struct inode *inode)
232{
233 int size;
234
235 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
236 / sizeof(struct ext4_extent_idx);
237#ifdef AGRESSIVE_TEST
238 if (size > 5)
239 size = 5;
240#endif
241 return size;
242}
243
244static inline int ext4_ext_space_root(struct inode *inode)
245{
246 int size;
247
248 size = sizeof(EXT4_I(inode)->i_data);
249 size -= sizeof(struct ext4_extent_header);
250 size /= sizeof(struct ext4_extent);
251#ifdef AGRESSIVE_TEST
252 if (size > 3)
253 size = 3;
254#endif
255 return size;
256}
257
258static inline int ext4_ext_space_root_idx(struct inode *inode)
259{
260 int size;
261
262 size = sizeof(EXT4_I(inode)->i_data);
263 size -= sizeof(struct ext4_extent_header);
264 size /= sizeof(struct ext4_extent_idx);
265#ifdef AGRESSIVE_TEST
266 if (size > 4)
267 size = 4;
268#endif
269 return size;
270}
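As a rough worked example of what these helpers compute (assuming, as an editorial approximation, 12-byte on-disk sizes for the extent header, extent and index entries, and a 60-byte in-inode i_data area): a 4 KiB block holds (4096 - 12) / 12 = 340 extents or index entries, while the in-inode root holds (60 - 12) / 12 = 4, before any AGRESSIVE_TEST caps are applied.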
271
272#ifdef EXT_DEBUG
273static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
274{
275 int k, l = path->p_depth;
276
277 ext_debug("path:");
278 for (k = 0; k <= l; k++, path++) {
279 if (path->p_idx) {
280 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
281 idx_pblock(path->p_idx));
282 } else if (path->p_ext) {
283 ext_debug(" %d:%d:%llu ",
284 le32_to_cpu(path->p_ext->ee_block),
285 le16_to_cpu(path->p_ext->ee_len),
286 ext_pblock(path->p_ext));
287 } else
288 ext_debug(" []");
289 }
290 ext_debug("\n");
291}
292
293static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
294{
295 int depth = ext_depth(inode);
296 struct ext4_extent_header *eh;
297 struct ext4_extent *ex;
298 int i;
299
300 if (!path)
301 return;
302
303 eh = path[depth].p_hdr;
304 ex = EXT_FIRST_EXTENT(eh);
305
306 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
307 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
308 le16_to_cpu(ex->ee_len), ext_pblock(ex));
309 }
310 ext_debug("\n");
311}
312#else
313#define ext4_ext_show_path(inode,path)
314#define ext4_ext_show_leaf(inode,path)
315#endif
316
317static void ext4_ext_drop_refs(struct ext4_ext_path *path)
318{
319 int depth = path->p_depth;
320 int i;
321
322 for (i = 0; i <= depth; i++, path++)
323 if (path->p_bh) {
324 brelse(path->p_bh);
325 path->p_bh = NULL;
326 }
327}
328
329/*
330 * ext4_ext_binsearch_idx:
331 * binary search for the closest index of the given block
332 */
333static void
334ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
335{
336 struct ext4_extent_header *eh = path->p_hdr;
337 struct ext4_extent_idx *r, *l, *m;
338
339 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
340 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
341 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
342
343 ext_debug("binsearch for %d(idx): ", block);
344
345 l = EXT_FIRST_INDEX(eh) + 1;
346 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
347 while (l <= r) {
348 m = l + (r - l) / 2;
349 if (block < le32_to_cpu(m->ei_block))
350 r = m - 1;
351 else
352 l = m + 1;
353 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
354 m, m->ei_block, r, r->ei_block);
355 }
356
357 path->p_idx = l - 1;
358 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
359		  idx_pblock(path->p_idx));
360
361#ifdef CHECK_BINSEARCH
362 {
363 struct ext4_extent_idx *chix, *ix;
364 int k;
365
366 chix = ix = EXT_FIRST_INDEX(eh);
367 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
368 if (k != 0 &&
369 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
370 printk("k=%d, ix=0x%p, first=0x%p\n", k,
371 ix, EXT_FIRST_INDEX(eh));
372 printk("%u <= %u\n",
373 le32_to_cpu(ix->ei_block),
374 le32_to_cpu(ix[-1].ei_block));
375 }
376 BUG_ON(k && le32_to_cpu(ix->ei_block)
377 <= le32_to_cpu(ix[-1].ei_block));
378 if (block < le32_to_cpu(ix->ei_block))
379 break;
380 chix = ix;
381 }
382 BUG_ON(chix != path->p_idx);
383 }
384#endif
385
386}
387
388/*
389 * ext4_ext_binsearch:
390 * binary search for closest extent of the given block
391 */
392static void
393ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
394{
395 struct ext4_extent_header *eh = path->p_hdr;
396 struct ext4_extent *r, *l, *m;
397
398 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
399 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
400
401 if (eh->eh_entries == 0) {
402 /*
403 * this leaf is empty:
404 * we get such a leaf in split/add case
405 */
406 return;
407 }
408
409 ext_debug("binsearch for %d: ", block);
410
411 l = EXT_FIRST_EXTENT(eh) + 1;
412 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
413
414 while (l <= r) {
415 m = l + (r - l) / 2;
416 if (block < le32_to_cpu(m->ee_block))
417 r = m - 1;
418 else
419 l = m + 1;
420 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
421 m, m->ee_block, r, r->ee_block);
422 }
423
424 path->p_ext = l - 1;
425 ext_debug(" -> %d:%llu:%d ",
426 le32_to_cpu(path->p_ext->ee_block),
427 ext_pblock(path->p_ext),
428 le16_to_cpu(path->p_ext->ee_len));
429
430#ifdef CHECK_BINSEARCH
431 {
432 struct ext4_extent *chex, *ex;
433 int k;
434
435 chex = ex = EXT_FIRST_EXTENT(eh);
436 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
437 BUG_ON(k && le32_to_cpu(ex->ee_block)
438 <= le32_to_cpu(ex[-1].ee_block));
439 if (block < le32_to_cpu(ex->ee_block))
440 break;
441 chex = ex;
442 }
443 BUG_ON(chex != path->p_ext);
444 }
445#endif
446
447}
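Both binary searches above find the rightmost entry whose starting block is <= the target: l is kept just past the remaining candidates and l - 1 is returned, falling back to the first entry when the target precedes them all. A stand-alone sketch of that search variant over a plain sorted array (illustrative, not the kernel code):

#include <stdio.h>

/* Index of the rightmost element <= key; returns 0 if key precedes all
 * elements, just as the extent/index searches above fall back to the
 * first entry. */
static int rightmost_le(const int *a, int n, int key)
{
	int l = 1, r = n - 1;

	while (l <= r) {
		int m = l + (r - l) / 2;

		if (key < a[m])
			r = m - 1;
		else
			l = m + 1;
	}
	return l - 1;
}

int main(void)
{
	int blocks[] = { 0, 10, 20, 30 };

	printf("%d\n", rightmost_le(blocks, 4, 25));	/* 2: entry starting at 20 */
	return 0;
}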
448
449int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
450{
451 struct ext4_extent_header *eh;
452
453 eh = ext_inode_hdr(inode);
454 eh->eh_depth = 0;
455 eh->eh_entries = 0;
456 eh->eh_magic = EXT4_EXT_MAGIC;
457 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
458 ext4_mark_inode_dirty(handle, inode);
459 ext4_ext_invalidate_cache(inode);
460 return 0;
461}
462
463struct ext4_ext_path *
464ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
465{
466 struct ext4_extent_header *eh;
467 struct buffer_head *bh;
468 short int depth, i, ppos = 0, alloc = 0;
469
470 eh = ext_inode_hdr(inode);
471 BUG_ON(eh == NULL);
472 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
473 return ERR_PTR(-EIO);
474
475 i = depth = ext_depth(inode);
476
477 /* account possible depth increase */
478 if (!path) {
479 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
480 GFP_NOFS);
481 if (!path)
482 return ERR_PTR(-ENOMEM);
483 alloc = 1;
484 }
485 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
486 path[0].p_hdr = eh;
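/*
 * path[0] describes the root header kept in the inode; each pass of
 * the loop below binary-searches the current index block, reads the
 * child block it points at and records it in the next path slot,
 * until i reaches 0 at the leaf level.
 */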
487
488 /* walk through the tree */
489 while (i) {
490 ext_debug("depth %d: num %d, max %d\n",
491 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
492 ext4_ext_binsearch_idx(inode, path + ppos, block);
493 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
494 path[ppos].p_depth = i;
495 path[ppos].p_ext = NULL;
496
497 bh = sb_bread(inode->i_sb, path[ppos].p_block);
498 if (!bh)
499 goto err;
500
501 eh = ext_block_hdr(bh);
502 ppos++;
503 BUG_ON(ppos > depth);
504 path[ppos].p_bh = bh;
505 path[ppos].p_hdr = eh;
506 i--;
507
508 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
509 goto err;
510 }
511
512 path[ppos].p_depth = i;
513 path[ppos].p_hdr = eh;
514 path[ppos].p_ext = NULL;
515 path[ppos].p_idx = NULL;
516
517 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
518 goto err;
519
520 /* find extent */
521 ext4_ext_binsearch(inode, path + ppos, block);
522
523 ext4_ext_show_path(inode, path);
524
525 return path;
526
527err:
528 ext4_ext_drop_refs(path);
529 if (alloc)
530 kfree(path);
531 return ERR_PTR(-EIO);
532}
533
534/*
535 * ext4_ext_insert_index:
536 * insert new index [@logical;@ptr] into the block at @curp;
537 * check where to insert: before @curp or after @curp
538 */
539static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
540 struct ext4_ext_path *curp,
541 int logical, ext4_fsblk_t ptr)
542{
543 struct ext4_extent_idx *ix;
544 int len, err;
545
546 if ((err = ext4_ext_get_access(handle, inode, curp)))
547 return err;
548
549 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
550 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
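/*
 * len counts the index slots between p_idx and the end of the block;
 * the branches below turn it into a byte count for the memmove()
 * that shifts existing entries right to make room.
 */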
551 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
552 /* insert after */
553 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
554 len = (len - 1) * sizeof(struct ext4_extent_idx);
555 len = len < 0 ? 0 : len;
556 ext_debug("insert new index %d after: %d. "
557 "move %d from 0x%p to 0x%p\n",
558 logical, ptr, len,
559 (curp->p_idx + 1), (curp->p_idx + 2));
560 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
561 }
562 ix = curp->p_idx + 1;
563 } else {
564 /* insert before */
565 len = len * sizeof(struct ext4_extent_idx);
566 len = len < 0 ? 0 : len;
567 ext_debug("insert new index %d before: %d. "
568 "move %d from 0x%p to 0x%p\n",
569 logical, ptr, len,
570 curp->p_idx, (curp->p_idx + 1));
571 memmove(curp->p_idx + 1, curp->p_idx, len);
572 ix = curp->p_idx;
573 }
574
575 ix->ei_block = cpu_to_le32(logical);
576 ext4_idx_store_pblock(ix, ptr);
577 curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
578
579 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
580 > le16_to_cpu(curp->p_hdr->eh_max));
581 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
582
583 err = ext4_ext_dirty(handle, inode, curp);
584 ext4_std_error(inode->i_sb, err);
585
586 return err;
587}
588
589/*
590 * ext4_ext_split:
591 * inserts new subtree into the path, using free index entry
592 * at depth @at:
593 * - allocates all needed blocks (new leaf and all intermediate index blocks)
594 * - makes decision where to split
595 * - moves remaining extents and index entries (right to the split point)
596 * into the newly allocated blocks
597 * - initializes subtree
598 */
599static int ext4_ext_split(handle_t *handle, struct inode *inode,
600 struct ext4_ext_path *path,
601 struct ext4_extent *newext, int at)
602{
603 struct buffer_head *bh = NULL;
604 int depth = ext_depth(inode);
605 struct ext4_extent_header *neh;
606 struct ext4_extent_idx *fidx;
607 struct ext4_extent *ex;
608 int i = at, k, m, a;
609 ext4_fsblk_t newblock, oldblock;
610 __le32 border;
611 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
612 int err = 0;
613
614 /* make decision: where to split? */
615 /* FIXME: now decision is simplest: at current extent */
616
617 /* if the current leaf is going to be split, then use the
618 * border from the split point */
619 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
620 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
621 border = path[depth].p_ext[1].ee_block;
622 ext_debug("leaf will be split."
623 " next leaf starts at %d\n",
624 le32_to_cpu(border));
625 } else {
626 border = newext->ee_block;
627 ext_debug("leaf will be added."
628 " next leaf starts at %d\n",
629 le32_to_cpu(border));
630 }
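/*
 * border is the first logical block the new right-hand subtree will
 * cover; it becomes the ei_block of every index entry created for
 * that subtree and of the entry inserted at level @at.
 */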
631
632 /*
633 * If an error occurs, we stop processing and mark the
634 * filesystem read-only. The index won't be inserted and
635 * the tree stays consistent. The next mount will repair
636 * the buffers as well.
637 */
638
639 /*
640 * Get array to track all allocated blocks.
641 * We need this to handle errors and free blocks
642 * upon them.
643 */
644 ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
645 if (!ablocks)
646 return -ENOMEM;
647 memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
648
649 /* allocate all needed blocks */
650 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
651 for (a = 0; a < depth - at; a++) {
652 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
653 if (newblock == 0)
654 goto cleanup;
655 ablocks[a] = newblock;
656 }
657
658 /* initialize new leaf */
659 newblock = ablocks[--a];
660 BUG_ON(newblock == 0);
661 bh = sb_getblk(inode->i_sb, newblock);
662 if (!bh) {
663 err = -EIO;
664 goto cleanup;
665 }
666 lock_buffer(bh);
667
668 if ((err = ext4_journal_get_create_access(handle, bh)))
669 goto cleanup;
670
671 neh = ext_block_hdr(bh);
672 neh->eh_entries = 0;
673 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
674 neh->eh_magic = EXT4_EXT_MAGIC;
675 neh->eh_depth = 0;
676 ex = EXT_FIRST_EXTENT(neh);
677
678 /* move remainder of path[depth] to the new leaf */
679 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
680 /* start copy from next extent */
681 /* TODO: we could do it by single memmove */
682 m = 0;
683 path[depth].p_ext++;
684 while (path[depth].p_ext <=
685 EXT_MAX_EXTENT(path[depth].p_hdr)) {
686 ext_debug("move %d:%llu:%d in new leaf %llu\n",
687 le32_to_cpu(path[depth].p_ext->ee_block),
688 ext_pblock(path[depth].p_ext),
689 le16_to_cpu(path[depth].p_ext->ee_len),
690 newblock);
691 /*memmove(ex++, path[depth].p_ext++,
692 sizeof(struct ext4_extent));
693 neh->eh_entries++;*/
694 path[depth].p_ext++;
695 m++;
696 }
697 if (m) {
698 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
699 neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
700 }
701
702 set_buffer_uptodate(bh);
703 unlock_buffer(bh);
704
705 if ((err = ext4_journal_dirty_metadata(handle, bh)))
706 goto cleanup;
707 brelse(bh);
708 bh = NULL;
709
710 /* correct old leaf */
711 if (m) {
712 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
713 goto cleanup;
714 path[depth].p_hdr->eh_entries =
715 cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
716 if ((err = ext4_ext_dirty(handle, inode, path + depth)))
717 goto cleanup;
718
719 }
720
721 /* create intermediate indexes */
722 k = depth - at - 1;
723 BUG_ON(k < 0);
724 if (k)
725 ext_debug("create %d intermediate indices\n", k);
726 /* insert new index into current index block */
727 /* current depth stored in i var */
728 i = depth - 1;
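/*
 * Build the new index blocks bottom-up: each iteration takes the next
 * block from ablocks[], gives it a single entry pointing at the block
 * initialized in the previous iteration (border -> oldblock), and then
 * copies over the entries to the right of the split point from the old
 * index block at level i.
 */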
729 while (k--) {
730 oldblock = newblock;
731 newblock = ablocks[--a];
732 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
733 if (!bh) {
734 err = -EIO;
735 goto cleanup;
736 }
737 lock_buffer(bh);
738
739 if ((err = ext4_journal_get_create_access(handle, bh)))
740 goto cleanup;
741
742 neh = ext_block_hdr(bh);
743 neh->eh_entries = cpu_to_le16(1);
744 neh->eh_magic = EXT4_EXT_MAGIC;
745 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
746 neh->eh_depth = cpu_to_le16(depth - i);
747 fidx = EXT_FIRST_INDEX(neh);
748 fidx->ei_block = border;
749 ext4_idx_store_pblock(fidx, oldblock);
750
751 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
752 newblock, (unsigned long) le32_to_cpu(border),
753 oldblock);
754 /* copy indexes */
755 m = 0;
756 path[i].p_idx++;
757
758 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
759 EXT_MAX_INDEX(path[i].p_hdr));
760 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
761 EXT_LAST_INDEX(path[i].p_hdr));
762 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
763 ext_debug("%d: move %d:%d in new index %llu\n", i,
764 le32_to_cpu(path[i].p_idx->ei_block),
765 idx_pblock(path[i].p_idx),
766 newblock);
767 /*memmove(++fidx, path[i].p_idx++,
768 sizeof(struct ext4_extent_idx));
769 neh->eh_entries++;
770 BUG_ON(neh->eh_entries > neh->eh_max);*/
771 path[i].p_idx++;
772 m++;
773 }
774 if (m) {
775 memmove(++fidx, path[i].p_idx - m,
776 sizeof(struct ext4_extent_idx) * m);
777 neh->eh_entries =
778 cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
779 }
780 set_buffer_uptodate(bh);
781 unlock_buffer(bh);
782
783 if ((err = ext4_journal_dirty_metadata(handle, bh)))
784 goto cleanup;
785 brelse(bh);
786 bh = NULL;
787
788 /* correct old index */
789 if (m) {
790 err = ext4_ext_get_access(handle, inode, path + i);
791 if (err)
792 goto cleanup;
793 path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
794 err = ext4_ext_dirty(handle, inode, path + i);
795 if (err)
796 goto cleanup;
797 }
798
799 i--;
800 }
801
802 /* insert new index */
803 if (err)
804 goto cleanup;
805
806 err = ext4_ext_insert_index(handle, inode, path + at,
807 le32_to_cpu(border), newblock);
808
809cleanup:
810 if (bh) {
811 if (buffer_locked(bh))
812 unlock_buffer(bh);
813 brelse(bh);
814 }
815
816 if (err) {
817 /* free all allocated blocks in error case */
818 for (i = 0; i < depth; i++) {
819 if (!ablocks[i])
820 continue;
821 ext4_free_blocks(handle, inode, ablocks[i], 1);
822 }
823 }
824 kfree(ablocks);
825
826 return err;
827}
828
829/*
830 * ext4_ext_grow_indepth:
831 * implements tree growing procedure:
832 * - allocates new block
833 * - moves top-level data (index block or leaf) into the new block
834 * - initializes new top-level, creating index that points to the
835 * just created block
836 */
837static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
838 struct ext4_ext_path *path,
839 struct ext4_extent *newext)
840{
841 struct ext4_ext_path *curp = path;
842 struct ext4_extent_header *neh;
843 struct ext4_extent_idx *fidx;
844 struct buffer_head *bh;
845 ext4_fsblk_t newblock;
846 int err = 0;
847
848 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
849 if (newblock == 0)
850 return err;
851
852 bh = sb_getblk(inode->i_sb, newblock);
853 if (!bh) {
854 err = -EIO;
855 ext4_std_error(inode->i_sb, err);
856 return err;
857 }
858 lock_buffer(bh);
859
860 if ((err = ext4_journal_get_create_access(handle, bh))) {
861 unlock_buffer(bh);
862 goto out;
863 }
864
865 /* move top-level index/leaf into new block */
866 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
867
868 /* set size of new block */
869 neh = ext_block_hdr(bh);
870 /* the old root may contain indexes or leaves,
871 * so calculate eh_max accordingly */
872 if (ext_depth(inode))
873 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
874 else
875 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
876 neh->eh_magic = EXT4_EXT_MAGIC;
877 set_buffer_uptodate(bh);
878 unlock_buffer(bh);
879
880 if ((err = ext4_journal_dirty_metadata(handle, bh)))
881 goto out;
882
883 /* create index in new top-level index: num,max,pointer */
884 if ((err = ext4_ext_get_access(handle, inode, curp)))
885 goto out;
886
887 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
888 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
889 curp->p_hdr->eh_entries = cpu_to_le16(1);
890 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
891 /* FIXME: it works, but actually path[0] can be index */
892 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
893 ext4_idx_store_pblock(curp->p_idx, newblock);
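/*
 * the root kept in the inode now holds a single index entry pointing
 * at newblock, which carries a copy of the old top level; eh_depth is
 * bumped below.
 */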
894
895 neh = ext_inode_hdr(inode);
896 fidx = EXT_FIRST_INDEX(neh);
897 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
898 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
899 le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
900
901 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
902 err = ext4_ext_dirty(handle, inode, curp);
903out:
904 brelse(bh);
905
906 return err;
907}
908
909/*
910 * ext4_ext_create_new_leaf:
911 * finds an index with a free entry and adds a new leaf.
912 * if no free index entry is found, the tree is grown in depth.
913 */
914static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
915 struct ext4_ext_path *path,
916 struct ext4_extent *newext)
917{
918 struct ext4_ext_path *curp;
919 int depth, i, err = 0;
920
921repeat:
922 i = depth = ext_depth(inode);
923
924 /* walk up the tree and look for a free index entry */
925 curp = path + depth;
926 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
927 i--;
928 curp--;
929 }
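/*
 * curp now points at the level closest to the leaf that still has a
 * free index entry, or at the root if every level is full; i is the
 * depth of that level.
 */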
930
931 /* we use already allocated block for index block,
932 * so subsequent data blocks should be contiguous */
933 if (EXT_HAS_FREE_INDEX(curp)) {
934 /* if we found index with free entry, then use that
935 * entry: create all needed subtree and add new leaf */
936 err = ext4_ext_split(handle, inode, path, newext, i);
937
938 /* refill path */
939 ext4_ext_drop_refs(path);
940 path = ext4_ext_find_extent(inode,
941 le32_to_cpu(newext->ee_block),
942 path);
943 if (IS_ERR(path))
944 err = PTR_ERR(path);
945 } else {
946 /* tree is full, time to grow in depth */
947 err = ext4_ext_grow_indepth(handle, inode, path, newext);
948 if (err)
949 goto out;
950
951 /* refill path */
952 ext4_ext_drop_refs(path);
953 path = ext4_ext_find_extent(inode,
954 le32_to_cpu(newext->ee_block),
955 path);
956 if (IS_ERR(path)) {
957 err = PTR_ERR(path);
958 goto out;
959 }
960
961 /*
962 * only the first grow (depth 0 -> 1) guarantees free index space;
963 * in all other cases we have to split the grown tree
964 */
965 depth = ext_depth(inode);
966 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
967 /* now we need to split */
968 goto repeat;
969 }
970 }
971
972out:
973 return err;
974}
975
976/*
977 * ext4_ext_next_allocated_block:
978 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
979 * NOTE: it considers block number from index entry as
980 * allocated block. Thus, index entries have to be consistent
981 * with leaves.
982 */
983static unsigned long
984ext4_ext_next_allocated_block(struct ext4_ext_path *path)
985{
986 int depth;
987
988 BUG_ON(path == NULL);
989 depth = path->p_depth;
990
991 if (depth == 0 && path->p_ext == NULL)
992 return EXT_MAX_BLOCK;
993
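/*
 * walk from the leaf towards the root; the first level at which the
 * current position is not the last entry tells us where the next
 * allocated region begins.
 */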
994 while (depth >= 0) {
995 if (depth == path->p_depth) {
996 /* leaf */
997 if (path[depth].p_ext !=
998 EXT_LAST_EXTENT(path[depth].p_hdr))
999 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1000 } else {
1001 /* index */
1002 if (path[depth].p_idx !=
1003 EXT_LAST_INDEX(path[depth].p_hdr))
1004 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1005 }
1006 depth--;
1007 }
1008
1009 return EXT_MAX_BLOCK;
1010}
1011
1012/*
1013 * ext4_ext_next_leaf_block:
1014 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1015 */
1016static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1017 struct ext4_ext_path *path)
1018{
1019 int depth;
1020
1021 BUG_ON(path == NULL);
1022 depth = path->p_depth;
1023
1024 /* a zero-depth tree has no leaf blocks at all (the root is the leaf) */
1025 if (depth == 0)
1026 return EXT_MAX_BLOCK;
1027
1028 /* go to index block */
1029 depth--;
1030
1031 while (depth >= 0) {
1032 if (path[depth].p_idx !=
1033 EXT_LAST_INDEX(path[depth].p_hdr))
1034 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1035 depth--;
1036 }
1037
1038 return EXT_MAX_BLOCK;
1039}
1040
1041/*
1042 * ext4_ext_correct_indexes:
1043 * if leaf gets modified and modified extent is first in the leaf,
1044 * then we have to correct all indexes above.
1045 * TODO: do we need to correct tree in all cases?
1046 */
1047int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1048 struct ext4_ext_path *path)
1049{
1050 struct ext4_extent_header *eh;
1051 int depth = ext_depth(inode);
1052 struct ext4_extent *ex;
1053 __le32 border;
1054 int k, err = 0;
1055
1056 eh = path[depth].p_hdr;
1057 ex = path[depth].p_ext;
1058 BUG_ON(ex == NULL);
1059 BUG_ON(eh == NULL);
1060
1061 if (depth == 0) {
1062 /* there is no tree at all */
1063 return 0;
1064 }
1065
1066 if (ex != EXT_FIRST_EXTENT(eh)) {
1067 /* we only need to correct the tree if the first extent in the leaf was modified */
1068 return 0;
1069 }
1070
1071 /*
1072 * TODO: we need correction if border is smaller than current one
1073 */
1074 k = depth - 1;
1075 border = path[depth].p_ext->ee_block;
1076 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1077 return err;
1078 path[k].p_idx->ei_block = border;
1079 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1080 return err;
1081
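/*
 * keep walking up: as long as the entry we just came through was the
 * first one in its index block, the parent's entry for that block
 * must be updated too; a non-first entry means the ancestors are
 * unaffected.
 */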
1082 while (k--) {
1083 /* change all left-side indexes */
1084 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1085 break;
1086 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1087 break;
1088 path[k].p_idx->ei_block = border;
1089 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1090 break;
1091 }
1092
1093 return err;
1094}
1095
1096static inline int
1097ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1098 struct ext4_extent *ex2)
1099{
1100 if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
1101 le32_to_cpu(ex2->ee_block))
1102 return 0;
1103
1104 /*
1105 * To allow future support for preallocated extents to be added
1106 * as an RO_COMPAT feature, refuse to merge two extents if
1107 * this can result in the top bit of ee_len being set.
1108 */
1109 if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
1110 return 0;
1111#ifdef AGRESSIVE_TEST
1112 if (le16_to_cpu(ex1->ee_len) >= 4)
1113 return 0;
1114#endif
1115
1116 if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
1117 return 1;
1118 return 0;
1119}
1120
1121/*
1122 * ext4_ext_insert_extent:
1123 * tries to merge the requested extent into an existing extent or
1124 * inserts the requested extent as a new one into the tree,
1125 * creating a new leaf in the no-space case.
1126 */
1127int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1128 struct ext4_ext_path *path,
1129 struct ext4_extent *newext)
1130{
1131 struct ext4_extent_header * eh;
1132 struct ext4_extent *ex, *fex;
1133 struct ext4_extent *nearex; /* nearest extent */
1134 struct ext4_ext_path *npath = NULL;
1135 int depth, len, err, next;
1136
1137 BUG_ON(newext->ee_len == 0);
1138 depth = ext_depth(inode);
1139 ex = path[depth].p_ext;
1140 BUG_ON(path[depth].p_hdr == NULL);
1141
1142 /* try to insert block into found extent and return */
1143 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1144 ext_debug("append %d block to %d:%d (from %llu)\n",
1145 le16_to_cpu(newext->ee_len),
1146 le32_to_cpu(ex->ee_block),
1147 le16_to_cpu(ex->ee_len), ext_pblock(ex));
1148 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1149 return err;
1150 ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
1151 + le16_to_cpu(newext->ee_len));
1152 eh = path[depth].p_hdr;
1153 nearex = ex;
1154 goto merge;
1155 }
1156
1157repeat:
1158 depth = ext_depth(inode);
1159 eh = path[depth].p_hdr;
1160 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1161 goto has_space;
1162
1163 /* probably next leaf has space for us? */
1164 fex = EXT_LAST_EXTENT(eh);
1165 next = ext4_ext_next_leaf_block(inode, path);
1166 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1167 && next != EXT_MAX_BLOCK) {
1168 ext_debug("next leaf block - %d\n", next);
1169 BUG_ON(npath != NULL);
1170 npath = ext4_ext_find_extent(inode, next, NULL);
1171 if (IS_ERR(npath))
1172 return PTR_ERR(npath);
1173 BUG_ON(npath->p_depth != path->p_depth);
1174 eh = npath[depth].p_hdr;
1175 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1176 ext_debug("next leaf isnt full(%d)\n",
1177 le16_to_cpu(eh->eh_entries));
1178 path = npath;
1179 goto repeat;
1180 }
1181 ext_debug("next leaf has no free space(%d,%d)\n",
1182 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1183 }
1184
1185 /*
1186 * There is no free space in the found leaf.
1187 * We're gonna add a new leaf in the tree.
1188 */
1189 err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1190 if (err)
1191 goto cleanup;
1192 depth = ext_depth(inode);
1193 eh = path[depth].p_hdr;
1194
1195has_space:
1196 nearex = path[depth].p_ext;
1197
1198 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1199 goto cleanup;
1200
1201 if (!nearex) {
1202 /* there is no extent in this leaf, create first one */
1203 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1204 le32_to_cpu(newext->ee_block),
1205 ext_pblock(newext),
1206 le16_to_cpu(newext->ee_len));
1207 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1208 } else if (le32_to_cpu(newext->ee_block)
1209 > le32_to_cpu(nearex->ee_block)) {
1210/* BUG_ON(newext->ee_block == nearex->ee_block); */
1211 if (nearex != EXT_LAST_EXTENT(eh)) {
1212 len = EXT_MAX_EXTENT(eh) - nearex;
1213 len = (len - 1) * sizeof(struct ext4_extent);
1214 len = len < 0 ? 0 : len;
1215 ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1216 "move %d from 0x%p to 0x%p\n",
1217 le32_to_cpu(newext->ee_block),
1218 ext_pblock(newext),
1219 le16_to_cpu(newext->ee_len),
1220 nearex, len, nearex + 1, nearex + 2);
1221 memmove(nearex + 2, nearex + 1, len);
1222 }
1223 path[depth].p_ext = nearex + 1;
1224 } else {
1225 BUG_ON(newext->ee_block == nearex->ee_block);
1226 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1227 len = len < 0 ? 0 : len;
1228 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1229 "move %d from 0x%p to 0x%p\n",
1230 le32_to_cpu(newext->ee_block),
1231 ext_pblock(newext),
1232 le16_to_cpu(newext->ee_len),
1233 nearex, len, nearex + 1, nearex + 2);
1234 memmove(nearex + 1, nearex, len);
1235 path[depth].p_ext = nearex;
1236 }
1237
1238 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
1239 nearex = path[depth].p_ext;
1240 nearex->ee_block = newext->ee_block;
1241 nearex->ee_start = newext->ee_start;
1242 nearex->ee_start_hi = newext->ee_start_hi;
1243 nearex->ee_len = newext->ee_len;
1244
1245merge:
1246 /* try to merge extents to the right */
1247 while (nearex < EXT_LAST_EXTENT(eh)) {
1248 if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
1249 break;
1250 /* merge with next extent! */
1251 nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
1252 + le16_to_cpu(nearex[1].ee_len));
1253 if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1254 len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1255 * sizeof(struct ext4_extent);
1256 memmove(nearex + 1, nearex + 2, len);
1257 }
1258 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1259 BUG_ON(eh->eh_entries == 0);
1260 }
1261
1262 /* try to merge extents to the left */
1263
1264 /* time to correct all indexes above */
1265 err = ext4_ext_correct_indexes(handle, inode, path);
1266 if (err)
1267 goto cleanup;
1268
1269 err = ext4_ext_dirty(handle, inode, path + depth);
1270
1271cleanup:
1272 if (npath) {
1273 ext4_ext_drop_refs(npath);
1274 kfree(npath);
1275 }
1276 ext4_ext_tree_changed(inode);
1277 ext4_ext_invalidate_cache(inode);
1278 return err;
1279}
1280
1281int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1282 unsigned long num, ext_prepare_callback func,
1283 void *cbdata)
1284{
1285 struct ext4_ext_path *path = NULL;
1286 struct ext4_ext_cache cbex;
1287 struct ext4_extent *ex;
1288 unsigned long next, start = 0, end = 0;
1289 unsigned long last = block + num;
1290 int depth, exists, err = 0;
1291
1292 BUG_ON(func == NULL);
1293 BUG_ON(inode == NULL);
1294
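/*
 * iterate over the logical range [block, last): for each position
 * look up the covering leaf and report either the extent covering it
 * or the gap up to the next allocated block; the callback may return
 * EXT_REPEAT to re-examine the range, EXT_BREAK to stop, or a
 * negative error.
 */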
1295 while (block < last && block != EXT_MAX_BLOCK) {
1296 num = last - block;
1297 /* find extent for this block */
1298 path = ext4_ext_find_extent(inode, block, path);
1299 if (IS_ERR(path)) {
1300 err = PTR_ERR(path);
1301 path = NULL;
1302 break;
1303 }
1304
1305 depth = ext_depth(inode);
1306 BUG_ON(path[depth].p_hdr == NULL);
1307 ex = path[depth].p_ext;
1308 next = ext4_ext_next_allocated_block(path);
1309
1310 exists = 0;
1311 if (!ex) {
1312 /* there is no extent yet, so try to allocate
1313 * all requested space */
1314 start = block;
1315 end = block + num;
1316 } else if (le32_to_cpu(ex->ee_block) > block) {
1317 /* need to allocate space before found extent */
1318 start = block;
1319 end = le32_to_cpu(ex->ee_block);
1320 if (block + num < end)
1321 end = block + num;
1322 } else if (block >=
1323 le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
1324 /* need to allocate space after found extent */
1325 start = block;
1326 end = block + num;
1327 if (end >= next)
1328 end = next;
1329 } else if (block >= le32_to_cpu(ex->ee_block)) {
1330 /*
1331 * some part of requested space is covered
1332 * by found extent
1333 */
1334 start = block;
1335 end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
1336 if (block + num < end)
1337 end = block + num;
1338 exists = 1;
1339 } else {
1340 BUG();
1341 }
1342 BUG_ON(end <= start);
1343
1344 if (!exists) {
1345 cbex.ec_block = start;
1346 cbex.ec_len = end - start;
1347 cbex.ec_start = 0;
1348 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1349 } else {
1350 cbex.ec_block = le32_to_cpu(ex->ee_block);
1351 cbex.ec_len = le16_to_cpu(ex->ee_len);
1352 cbex.ec_start = ext_pblock(ex);
1353 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1354 }
1355
1356 BUG_ON(cbex.ec_len == 0);
1357 err = func(inode, path, &cbex, cbdata);
1358 ext4_ext_drop_refs(path);
1359
1360 if (err < 0)
1361 break;
1362 if (err == EXT_REPEAT)
1363 continue;
1364 else if (err == EXT_BREAK) {
1365 err = 0;
1366 break;
1367 }
1368
1369 if (ext_depth(inode) != depth) {
1370 /* depth was changed. we have to realloc path */
1371 kfree(path);
1372 path = NULL;
1373 }
1374
1375 block = cbex.ec_block + cbex.ec_len;
1376 }
1377
1378 if (path) {
1379 ext4_ext_drop_refs(path);
1380 kfree(path);
1381 }
1382
1383 return err;
1384}
1385
1386static inline void
1387ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1388 __u32 len, __u32 start, int type)
1389{
1390 struct ext4_ext_cache *cex;
1391 BUG_ON(len == 0);
1392 cex = &EXT4_I(inode)->i_cached_extent;
1393 cex->ec_type = type;
1394 cex->ec_block = block;
1395 cex->ec_len = len;
1396 cex->ec_start = start;
1397}
1398
1399/*
1400 * ext4_ext_put_gap_in_cache:
1401 * calculate boundaries of the gap that the requested block fits into
1402 * and cache this gap
1403 */
1404static inline void
1405ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1406 unsigned long block)
1407{
1408 int depth = ext_depth(inode);
1409 unsigned long lblock, len;
1410 struct ext4_extent *ex;
1411
1412 ex = path[depth].p_ext;
1413 if (ex == NULL) {
1414 /* there is no extent yet, so gap is [0;-] */
1415 lblock = 0;
1416 len = EXT_MAX_BLOCK;
1417 ext_debug("cache gap(whole file):");
1418 } else if (block < le32_to_cpu(ex->ee_block)) {
1419 lblock = block;
1420 len = le32_to_cpu(ex->ee_block) - block;
1421 ext_debug("cache gap(before): %lu [%lu:%lu]",
1422 (unsigned long) block,
1423 (unsigned long) le32_to_cpu(ex->ee_block),
1424 (unsigned long) le16_to_cpu(ex->ee_len));
1425 } else if (block >= le32_to_cpu(ex->ee_block)
1426 + le16_to_cpu(ex->ee_len)) {
1427 lblock = le32_to_cpu(ex->ee_block)
1428 + le16_to_cpu(ex->ee_len);
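/*
 * ext4_ext_next_allocated_block() returns the logical start of the
 * next extent (or EXT_MAX_BLOCK), so the gap runs from lblock up to
 * that value.
 */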
1429 len = ext4_ext_next_allocated_block(path);
1430 ext_debug("cache gap(after): [%lu:%lu] %lu",
1431 (unsigned long) le32_to_cpu(ex->ee_block),
1432 (unsigned long) le16_to_cpu(ex->ee_len),
1433 (unsigned long) block);
1434 BUG_ON(len == lblock);
1435 len = len - lblock;
1436 } else {
1437 lblock = len = 0;
1438 BUG();
1439 }
1440
1441 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
1442 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1443}
1444
1445static inline int
1446ext4_ext_in_cache(struct inode *inode, unsigned long block,
1447 struct ext4_extent *ex)
1448{
1449 struct ext4_ext_cache *cex;
1450
1451 cex = &EXT4_I(inode)->i_cached_extent;
1452
1453 /* has cache valid data? */
1454 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1455 return EXT4_EXT_CACHE_NO;
1456
1457 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1458 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1459 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1460 ex->ee_block = cpu_to_le32(cex->ec_block);
1461 ext4_ext_store_pblock(ex, cex->ec_start);
1462 ex->ee_len = cpu_to_le16(cex->ec_len);
1463 ext_debug("%lu cached by %lu:%lu:%llu\n",
1464 (unsigned long) block,
1465 (unsigned long) cex->ec_block,
1466 (unsigned long) cex->ec_len,
1467 cex->ec_start);
1468 return cex->ec_type;
1469 }
1470
1471 /* not in cache */
1472 return EXT4_EXT_CACHE_NO;
1473}
1474
1475/*
1476 * ext4_ext_rm_idx:
1477 * removes index from the index block.
1478 * It's used in truncate case only, thus all requests are for
1479 * last index in the block only.
1480 */
1481int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1482 struct ext4_ext_path *path)
1483{
1484 struct buffer_head *bh;
1485 int err;
1486 ext4_fsblk_t leaf;
1487
1488 /* the block at this level is empty; step up to the parent entry that points at it, drop the entry and free the block */
1489 path--;
1490 leaf = idx_pblock(path->p_idx);
1491 BUG_ON(path->p_hdr->eh_entries == 0);
1492 if ((err = ext4_ext_get_access(handle, inode, path)))
1493 return err;
1494 path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
1495 if ((err = ext4_ext_dirty(handle, inode, path)))
1496 return err;
1497 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1498 bh = sb_find_get_block(inode->i_sb, leaf);
1499 ext4_forget(handle, 1, inode, bh, leaf);
1500 ext4_free_blocks(handle, inode, leaf, 1);
1501 return err;
1502}
1503
1504/*
1505 * ext4_ext_calc_credits_for_insert:
1506 * This routine returns max. credits that the extent tree can consume.
1507 * It should be OK for low-performance paths like ->writepage()
1508 * To allow many writing processes to fit into a single transaction,
1509 * the caller should calculate credits under truncate_mutex and
1510 * pass the actual path.
1511 */
1512inline int ext4_ext_calc_credits_for_insert(struct inode *inode,
1513 struct ext4_ext_path *path)
1514{
1515 int depth, needed;
1516
1517 if (path) {
1518 /* probably there is space in leaf? */
1519 depth = ext_depth(inode);
1520 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1521 < le16_to_cpu(path[depth].p_hdr->eh_max))
1522 return 1;
1523 }
1524
1525 /*
1526 * given 32-bit logical block (4294967296 blocks), max. tree
1527 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1528 * Let's also add one more level for imbalance.
1529 */
1530 depth = 5;
1531
1532 /* allocation of new data block(s) */
1533 needed = 2;
1534
1535 /*
1536 * tree can be full, so it would need to grow in depth:
1537 * allocation + old root + new root
1538 */
1539 needed += 2 + 1 + 1;
1540
1541 /*
1542 * Index split can happen, we would need:
1543 * allocate intermediate indexes (bitmap + group)
1544 * + change two blocks at each level, but root (already included)
1545 */
1546 needed += (depth * 2) + (depth * 2);
1547
1548 /* any allocation modifies superblock */
1549 needed += 1;
1550
1551 return needed;
1552}
1553
1554static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1555 struct ext4_extent *ex,
1556 unsigned long from, unsigned long to)
1557{
1558 struct buffer_head *bh;
1559 int i;
1560
1561#ifdef EXTENTS_STATS
1562 {
1563 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1564 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1565 spin_lock(&sbi->s_ext_stats_lock);
1566 sbi->s_ext_blocks += ee_len;
1567 sbi->s_ext_extents++;
1568 if (ee_len < sbi->s_ext_min)
1569 sbi->s_ext_min = ee_len;
1570 if (ee_len > sbi->s_ext_max)
1571 sbi->s_ext_max = ee_len;
1572 if (ext_depth(inode) > sbi->s_depth_max)
1573 sbi->s_depth_max = ext_depth(inode);
1574 spin_unlock(&sbi->s_ext_stats_lock);
1575 }
1576#endif
1577 if (from >= le32_to_cpu(ex->ee_block)
1578 && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1579 /* tail removal */
1580 unsigned long num;
1581 ext4_fsblk_t start;
1582 num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
1583 start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
1584 ext_debug("free last %lu blocks starting %llu\n", num, start);
1585 for (i = 0; i < num; i++) {
1586 bh = sb_find_get_block(inode->i_sb, start + i);
1587 ext4_forget(handle, 0, inode, bh, start + i);
1588 }
1589 ext4_free_blocks(handle, inode, start, num);
1590 } else if (from == le32_to_cpu(ex->ee_block)
1591 && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1592 printk("strange request: removal %lu-%lu from %u:%u\n",
1593 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1594 } else {
1595 printk("strange request: removal(2) %lu-%lu from %u:%u\n",
1596 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1597 }
1598 return 0;
1599}
1600
1601static int
1602ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1603 struct ext4_ext_path *path, unsigned long start)
1604{
1605 int err = 0, correct_index = 0;
1606 int depth = ext_depth(inode), credits;
1607 struct ext4_extent_header *eh;
1608 unsigned a, b, block, num;
1609 unsigned long ex_ee_block;
1610 unsigned short ex_ee_len;
1611 struct ext4_extent *ex;
1612
1613 ext_debug("truncate since %lu in leaf\n", start);
1614 if (!path[depth].p_hdr)
1615 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1616 eh = path[depth].p_hdr;
1617 BUG_ON(eh == NULL);
1618 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
1619 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
1620
1621 /* find where to start removing */
1622 ex = EXT_LAST_EXTENT(eh);
1623
1624 ex_ee_block = le32_to_cpu(ex->ee_block);
1625 ex_ee_len = le16_to_cpu(ex->ee_len);
1626
1627 while (ex >= EXT_FIRST_EXTENT(eh) &&
1628 ex_ee_block + ex_ee_len > start) {
1629 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
1630 path[depth].p_ext = ex;
1631
1632 a = ex_ee_block > start ? ex_ee_block : start;
1633 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1634 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
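/*
 * [a, b] is the part of this extent that lies at or beyond @start,
 * i.e. the blocks that have to be freed.
 */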
1635
1636 ext_debug(" border %u:%u\n", a, b);
1637
1638 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1639 block = 0;
1640 num = 0;
1641 BUG();
1642 } else if (a != ex_ee_block) {
1643 /* remove tail of the extent */
1644 block = ex_ee_block;
1645 num = a - block;
1646 } else if (b != ex_ee_block + ex_ee_len - 1) {
1647 /* remove head of the extent */
1648 block = a;
1649 num = b - a;
1650 /* there is no "make a hole" API yet */
1651 BUG();
1652 } else {
1653 /* remove whole extent: excellent! */
1654 block = ex_ee_block;
1655 num = 0;
1656 BUG_ON(a != ex_ee_block);
1657 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1658 }
1659
1660 /* at present, extent can't cross block group: */
1661 /* leaf + bitmap + group desc + sb + inode */
1662 credits = 5;
1663 if (ex == EXT_FIRST_EXTENT(eh)) {
1664 correct_index = 1;
1665 credits += (ext_depth(inode)) + 1;
1666 }
1667#ifdef CONFIG_QUOTA
1668 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1669#endif
1670
1671 handle = ext4_ext_journal_restart(handle, credits);
1672 if (IS_ERR(handle)) {
1673 err = PTR_ERR(handle);
1674 goto out;
1675 }
1676
1677 err = ext4_ext_get_access(handle, inode, path + depth);
1678 if (err)
1679 goto out;
1680
1681 err = ext4_remove_blocks(handle, inode, ex, a, b);
1682 if (err)
1683 goto out;
1684
1685 if (num == 0) {
1686 /* this extent is removed; mark slot entirely unused */
1687 ext4_ext_store_pblock(ex, 0);
1688 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1689 }
1690
1691 ex->ee_block = cpu_to_le32(block);
1692 ex->ee_len = cpu_to_le16(num);
1693
1694 err = ext4_ext_dirty(handle, inode, path + depth);
1695 if (err)
1696 goto out;
1697
1698 ext_debug("new extent: %u:%u:%llu\n", block, num,
1699 ext_pblock(ex));
1700 ex--;
1701 ex_ee_block = le32_to_cpu(ex->ee_block);
1702 ex_ee_len = le16_to_cpu(ex->ee_len);
1703 }
1704
1705 if (correct_index && eh->eh_entries)
1706 err = ext4_ext_correct_indexes(handle, inode, path);
1707
1708 /* if this leaf is now empty, then we should
1709 * remove it from the index block above */
1710 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1711 err = ext4_ext_rm_idx(handle, inode, path + depth);
1712
1713out:
1714 return err;
1715}
1716
1717/*
1718 * ext4_ext_more_to_rm:
1719 * returns 1 if current index has to be freed (even partial)
1720 */
1721static inline int
1722ext4_ext_more_to_rm(struct ext4_ext_path *path)
1723{
1724 BUG_ON(path->p_idx == NULL);
1725
1726 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1727 return 0;
1728
1729 /*
1730 * if truncate on deeper level happened, it wasn't partial,
1731 * so we have to consider current index for truncation
1732 */
1733 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1734 return 0;
1735 return 1;
1736}
1737
1738int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1739{
1740 struct super_block *sb = inode->i_sb;
1741 int depth = ext_depth(inode);
1742 struct ext4_ext_path *path;
1743 handle_t *handle;
1744 int i = 0, err = 0;
1745
1746 ext_debug("truncate since %lu\n", start);
1747
1748 /* probably first extent we're gonna free will be last in block */
1749 handle = ext4_journal_start(inode, depth + 1);
1750 if (IS_ERR(handle))
1751 return PTR_ERR(handle);
1752
1753 ext4_ext_invalidate_cache(inode);
1754
1755 /*
1756 * We start scanning from right side, freeing all the blocks
1757 * after i_size and walking into the tree depth-wise.
1758 */
1759 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
1760 if (path == NULL) {
1761 ext4_journal_stop(handle);
1762 return -ENOMEM;
1763 }
1764 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
1765 path[0].p_hdr = ext_inode_hdr(inode);
1766 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
1767 err = -EIO;
1768 goto out;
1769 }
1770 path[0].p_depth = depth;
1771
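/*
 * Iterative depth-first walk: i is the current level. At the leaf
 * level all extents past @start are removed; at index levels we
 * descend into the rightmost unprocessed child, and on the way back
 * up any index block that became empty is removed as well.
 */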
1772 while (i >= 0 && err == 0) {
1773 if (i == depth) {
1774 /* this is leaf block */
1775 err = ext4_ext_rm_leaf(handle, inode, path, start);
1776 /* root level has p_bh == NULL, brelse() eats this */
1777 brelse(path[i].p_bh);
1778 path[i].p_bh = NULL;
1779 i--;
1780 continue;
1781 }
1782
1783 /* this is index block */
1784 if (!path[i].p_hdr) {
1785 ext_debug("initialize header\n");
1786 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
1787 if (ext4_ext_check_header(__FUNCTION__, inode,
1788 path[i].p_hdr)) {
1789 err = -EIO;
1790 goto out;
1791 }
1792 }
1793
1794 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
1795 > le16_to_cpu(path[i].p_hdr->eh_max));
1796 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
1797
1798 if (!path[i].p_idx) {
1799 /* this level hasn't been touched yet */
1800 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
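/*
 * eh_entries + 1 can never match the real entry count, so
 * ext4_ext_more_to_rm() is sure to descend on the first visit to this
 * level; p_block is refreshed with the true count before each descent.
 */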
1801 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
1802 ext_debug("init index ptr: hdr 0x%p, num %d\n",
1803 path[i].p_hdr,
1804 le16_to_cpu(path[i].p_hdr->eh_entries));
1805 } else {
1806 /* we were here before: move on to the previous entry (we scan from the last index towards the first) */
1807 path[i].p_idx--;
1808 }
1809
1810 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
1811 i, EXT_FIRST_INDEX(path[i].p_hdr),
1812 path[i].p_idx);
1813 if (ext4_ext_more_to_rm(path + i)) {
1814 /* go to the next level */
1815 ext_debug("move to level %d (block %llu)\n",
1816 i + 1, idx_pblock(path[i].p_idx));
1817 memset(path + i + 1, 0, sizeof(*path));
1818 path[i+1].p_bh =
1819 sb_bread(sb, idx_pblock(path[i].p_idx));
1820 if (!path[i+1].p_bh) {
1821 /* should we reset i_size? */
1822 err = -EIO;
1823 break;
1824 }
1825
1826 /* save actual number of indexes since this
1827 * number is changed at the next iteration */
1828 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
1829 i++;
1830 } else {
1831 /* we finished processing this index, go up */
1832 if (path[i].p_hdr->eh_entries == 0 && i > 0) {
1833 /* index is empty, remove it;
1834 * the handle must already have been prepared
1835 * by the leaf-removal pass */
1836 err = ext4_ext_rm_idx(handle, inode, path + i);
1837 }
1838 /* root level has p_bh == NULL, brelse() eats this */
1839 brelse(path[i].p_bh);
1840 path[i].p_bh = NULL;
1841 i--;
1842 ext_debug("return to level %d\n", i);
1843 }
1844 }
1845
1846 /* TODO: flexible tree reduction should be here */
1847 if (path->p_hdr->eh_entries == 0) {
1848 /*
1849 * truncate to zero freed all the tree,
1850 * so we need to correct eh_depth
1851 */
1852 err = ext4_ext_get_access(handle, inode, path);
1853 if (err == 0) {
1854 ext_inode_hdr(inode)->eh_depth = 0;
1855 ext_inode_hdr(inode)->eh_max =
1856 cpu_to_le16(ext4_ext_space_root(inode));
1857 err = ext4_ext_dirty(handle, inode, path);
1858 }
1859 }
1860out:
1861 ext4_ext_tree_changed(inode);
1862 ext4_ext_drop_refs(path);
1863 kfree(path);
1864 ext4_journal_stop(handle);
1865
1866 return err;
1867}
1868
1869/*
1870 * called at mount time
1871 */
1872void ext4_ext_init(struct super_block *sb)
1873{
1874 /*
1875 * possible initialization would be here
1876 */
1877
1878 if (test_opt(sb, EXTENTS)) {
1879 printk("EXT4-fs: file extents enabled");
1880#ifdef AGRESSIVE_TEST
1881 printk(", agressive tests");
1882#endif
1883#ifdef CHECK_BINSEARCH
1884 printk(", check binsearch");
1885#endif
1886#ifdef EXTENTS_STATS
1887 printk(", stats");
1888#endif
1889 printk("\n");
1890#ifdef EXTENTS_STATS
1891 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
1892 EXT4_SB(sb)->s_ext_min = 1 << 30;
1893 EXT4_SB(sb)->s_ext_max = 0;
1894#endif
1895 }
1896}
1897
1898/*
1899 * called at umount time
1900 */
1901void ext4_ext_release(struct super_block *sb)
1902{
1903 if (!test_opt(sb, EXTENTS))
1904 return;
1905
1906#ifdef EXTENTS_STATS
1907 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
1908 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
1910 sbi->s_ext_blocks, sbi->s_ext_extents,
1911 sbi->s_ext_blocks / sbi->s_ext_extents);
1912 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
1913 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
1914 }
1915#endif
1916}
1917
1918int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1919 ext4_fsblk_t iblock,
1920 unsigned long max_blocks, struct buffer_head *bh_result,
1921 int create, int extend_disksize)
1922{
1923 struct ext4_ext_path *path = NULL;
1924 struct ext4_extent newex, *ex;
1925 ext4_fsblk_t goal, newblock;
1926 int err = 0, depth;
1927 unsigned long allocated = 0;
1928
1929 __clear_bit(BH_New, &bh_result->b_state);
1930 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
1931 max_blocks, (unsigned) inode->i_ino);
1932 mutex_lock(&EXT4_I(inode)->truncate_mutex);
1933
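/*
 * Overall flow: consult the inode's single-entry extent cache first,
 * then look the block up in the tree. If an extent covers it, map it
 * and return; otherwise either cache the gap (when create == 0) or
 * allocate new blocks, insert the resulting extent and map that.
 */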
1934 /* check in cache */
1935 if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
1936 if (goal == EXT4_EXT_CACHE_GAP) {
1937 if (!create) {
1938 /* block isn't allocated yet and
1939 * user doesn't want to allocate it */
1940 goto out2;
1941 }
1942 /* we should allocate requested block */
1943 } else if (goal == EXT4_EXT_CACHE_EXTENT) {
1944 /* block is already allocated */
1945 newblock = iblock
1946 - le32_to_cpu(newex.ee_block)
1947 + ext_pblock(&newex);
1948 /* number of remaining blocks in the extent */
1949 allocated = le16_to_cpu(newex.ee_len) -
1950 (iblock - le32_to_cpu(newex.ee_block));
1951 goto out;
1952 } else {
1953 BUG();
1954 }
1955 }
1956
1957 /* find extent for this block */
1958 path = ext4_ext_find_extent(inode, iblock, NULL);
1959 if (IS_ERR(path)) {
1960 err = PTR_ERR(path);
1961 path = NULL;
1962 goto out2;
1963 }
1964
1965 depth = ext_depth(inode);
1966
1967 /*
1968 * consistent leaf must not be empty;
1969 * this situation is possible, though, _during_ tree modification;
1970 * this is why assert can't be put in ext4_ext_find_extent()
1971 */
1972 BUG_ON(path[depth].p_ext == NULL && depth != 0);
1973
1974 if ((ex = path[depth].p_ext)) {
1975 unsigned long ee_block = le32_to_cpu(ex->ee_block);
1976 ext4_fsblk_t ee_start = ext_pblock(ex);
1977 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1978
1979 /*
1980 * Allow future support for preallocated extents to be added
1981 * as an RO_COMPAT feature:
1982 * Uninitialized extents are treated as holes, except that
1983 * we avoid (fail) allocating new blocks during a write.
1984 */
1985 if (ee_len > EXT_MAX_LEN)
1986 goto out2;
1987 /* if found extent covers block, simply return it */
1988 if (iblock >= ee_block && iblock < ee_block + ee_len) {
1989 newblock = iblock - ee_block + ee_start;
1990 /* number of remaining blocks in the extent */
1991 allocated = ee_len - (iblock - ee_block);
1992 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
1993 ee_block, ee_len, newblock);
1994 ext4_ext_put_in_cache(inode, ee_block, ee_len,
1995 ee_start, EXT4_EXT_CACHE_EXTENT);
1996 goto out;
1997 }
1998 }
1999
2000 /*
2001 * requested block isn't allocated yet;
2002 * we must not allocate it if the create flag is zero
2003 */
2004 if (!create) {
2005 /* put just found gap into cache to speed up
2006 * subsequent requests */
2007 ext4_ext_put_gap_in_cache(inode, path, iblock);
2008 goto out2;
2009 }
2010 /*
2011 * Okay, we need to do block allocation. Lazily initialize the block
2012 * allocation info here if necessary.
2013 */
2014 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2015 ext4_init_block_alloc_info(inode);
2016
2017 /* allocate new block */
2018 goal = ext4_ext_find_goal(inode, path, iblock);
2019 allocated = max_blocks;
2020 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
2021 if (!newblock)
2022 goto out2;
2023 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2024 goal, newblock, allocated);
2025
2026 /* try to insert new extent into found leaf and return */
2027 newex.ee_block = cpu_to_le32(iblock);
2028 ext4_ext_store_pblock(&newex, newblock);
2029 newex.ee_len = cpu_to_le16(allocated);
2030 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2031 if (err)
2032 goto out2;
2033
2034 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2035 EXT4_I(inode)->i_disksize = inode->i_size;
2036
2037 /* previous routine could use block we allocated */
2038 newblock = ext_pblock(&newex);
2039 __set_bit(BH_New, &bh_result->b_state);
2040
2041 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2042 EXT4_EXT_CACHE_EXTENT);
2043out:
2044 if (allocated > max_blocks)
2045 allocated = max_blocks;
2046 ext4_ext_show_leaf(inode, path);
2047 __set_bit(BH_Mapped, &bh_result->b_state);
2048 bh_result->b_bdev = inode->i_sb->s_bdev;
2049 bh_result->b_blocknr = newblock;
2050out2:
2051 if (path) {
2052 ext4_ext_drop_refs(path);
2053 kfree(path);
2054 }
2055 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2056
2057 return err ? err : allocated;
2058}
2059
2060void ext4_ext_truncate(struct inode * inode, struct page *page)
2061{
2062 struct address_space *mapping = inode->i_mapping;
2063 struct super_block *sb = inode->i_sb;
2064 unsigned long last_block;
2065 handle_t *handle;
2066 int err = 0;
2067
2068 /*
2069 * probably first extent we're gonna free will be last in block
2070 */
2071 err = ext4_writepage_trans_blocks(inode) + 3;
2072 handle = ext4_journal_start(inode, err);
2073 if (IS_ERR(handle)) {
2074 if (page) {
2075 clear_highpage(page);
2076 flush_dcache_page(page);
2077 unlock_page(page);
2078 page_cache_release(page);
2079 }
2080 return;
2081 }
2082
2083 if (page)
2084 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2085
2086 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2087 ext4_ext_invalidate_cache(inode);
2088
2089 /*
2090 * TODO: optimization is possible here.
2091 * Probably we need not scan at all,
2092 * because page truncation is enough.
2093 */
2094 if (ext4_orphan_add(handle, inode))
2095 goto out_stop;
2096
2097 /* we have to know where to truncate from in crash case */
2098 EXT4_I(inode)->i_disksize = inode->i_size;
2099 ext4_mark_inode_dirty(handle, inode);
2100
2101 last_block = (inode->i_size + sb->s_blocksize - 1)
2102 >> EXT4_BLOCK_SIZE_BITS(sb);
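/*
 * last_block is the first block that starts at or beyond the new
 * i_size; everything from that block onwards is removed from the
 * extent tree.
 */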
2103 err = ext4_ext_remove_space(inode, last_block);
2104
2105 /* In a multi-transaction truncate, we only make the final
2106 * transaction synchronous. */
2107 if (IS_SYNC(inode))
2108 handle->h_sync = 1;
2109
2110out_stop:
2111 /*
2112 * If this was a simple ftruncate() and the file will remain alive,
2113 * then we need to clear up the orphan record which we created above.
2114 * However, if this was a real unlink then we were called by
2115 * ext4_delete_inode(), and we allow that function to clean up the
2116 * orphan info for us.
2117 */
2118 if (inode->i_nlink)
2119 ext4_orphan_del(handle, inode);
2120
2121 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2122 ext4_journal_stop(handle);
2123}
2124
2125/*
2126 * ext4_ext_writepage_trans_blocks:
2127 * calculate max number of blocks we could modify
2128 * in order to allocate new block for an inode
2129 */
2130int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2131{
2132 int needed;
2133
2134 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2135
2136 /* caller wants to allocate num blocks, but note it includes sb */
2137 needed = needed * num - (num - 1);
2138
2139#ifdef CONFIG_QUOTA
2140 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2141#endif
2142
2143 return needed;
2144}
2145
2146EXPORT_SYMBOL(ext4_mark_inode_dirty);
2147EXPORT_SYMBOL(ext4_ext_invalidate_cache);
2148EXPORT_SYMBOL(ext4_ext_insert_extent);
2149EXPORT_SYMBOL(ext4_ext_walk_space);
2150EXPORT_SYMBOL(ext4_ext_find_goal);
2151EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
2152
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
1/*
2 * linux/fs/ext4/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext4_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex);
41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
43 }
44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data);
46
47 return 0;
48}
49
50static ssize_t
51ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos)
53{
54 struct file *file = iocb->ki_filp;
55 struct inode *inode = file->f_dentry->d_inode;
56 ssize_t ret;
57 int err;
58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
60
61 /*
62 * Skip flushing if there was an error, or if nothing was written.
63 */
64 if (ret <= 0)
65 return ret;
66
67 /*
68 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
69 * journalling then we need to make sure that we force the transaction
70 * to disk to keep all metadata uptodate synchronously.
71 */
72 if (file->f_flags & O_SYNC) {
73 /*
74 * If we are non-data-journaled, then the dirty data has
75 * already been flushed to backing store by generic_osync_inode,
76 * and the inode has been flushed too if there have been any
77 * modifications other than mere timestamp updates.
78 *
79 * Open question --- do we care about flushing timestamps too
80 * if the inode is IS_SYNC?
81 */
82 if (!ext4_should_journal_data(inode))
83 return ret;
84
85 goto force_commit;
86 }
87
88 /*
89 * So we know that there has been no forced data flush. If the inode
90 * is marked IS_SYNC, we need to force one ourselves.
91 */
92 if (!IS_SYNC(inode))
93 return ret;
94
95 /*
96 * Open question #2 --- should we force data to disk here too? If we
97 * don't, the only impact is that data=writeback filesystems won't
98 * flush data to disk automatically on IS_SYNC, only metadata (but
99 * historically, that is what ext2 has done.)
100 */
101
102force_commit:
103 err = ext4_force_commit(inode->i_sb);
104 if (err)
105 return err;
106 return ret;
107}
108
109const struct file_operations ext4_file_operations = {
110 .llseek = generic_file_llseek,
111 .read = do_sync_read,
112 .write = do_sync_write,
113 .aio_read = generic_file_aio_read,
114 .aio_write = ext4_file_write,
115 .ioctl = ext4_ioctl,
116#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext4_compat_ioctl,
118#endif
119 .mmap = generic_file_mmap,
120 .open = generic_file_open,
121 .release = ext4_release_file,
122 .fsync = ext4_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write,
126};
127
128struct inode_operations ext4_file_inode_operations = {
129 .truncate = ext4_truncate,
130 .setattr = ext4_setattr,
131#ifdef CONFIG_EXT4DEV_FS_XATTR
132 .setxattr = generic_setxattr,
133 .getxattr = generic_getxattr,
134 .listxattr = ext4_listxattr,
135 .removexattr = generic_removexattr,
136#endif
137 .permission = ext4_permission,
138};
139
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext4/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext4fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/ext4_fs.h>
31#include <linux/ext4_jbd2.h>
32
33/*
34 * akpm: A new design for ext4_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext4_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext4_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext4_should_journal_data(inode)) {
71 ret = ext4_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext4/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/sched.h>
15#include <linux/ext4_fs.h>
16#include <linux/cryptohash.h>
17
18#define DELTA 0x9E3779B9
19
20static void TEA_transform(__u32 buf[4], __u32 const in[])
21{
22 __u32 sum = 0;
23 __u32 b0 = buf[0], b1 = buf[1];
24 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
25 int n = 16;
26
27 do {
28 sum += DELTA;
29 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
30 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
31 } while(--n);
32
33 buf[0] += b0;
34 buf[1] += b1;
35}
36
37
38/* The old legacy hash */
39static __u32 dx_hack_hash (const char *name, int len)
40{
41 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
42 while (len--) {
43 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
44
45 if (hash & 0x80000000) hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return (hash0 << 1);
50}
51
52static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
53{
54 __u32 pad, val;
55 int i;
56
57 pad = (__u32)len | ((__u32)len << 8);
58 pad |= pad << 16;
59
60 val = pad;
61 if (len > num*4)
62 len = num * 4;
63 for (i=0; i < len; i++) {
64 if ((i % 4) == 0)
65 val = pad;
66 val = msg[i] + (val << 8);
67 if ((i % 4) == 3) {
68 *buf++ = val;
69 val = pad;
70 num--;
71 }
72 }
73 if (--num >= 0)
74 *buf++ = val;
75 while (--num >= 0)
76 *buf++ = pad;
77}
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is a 4-longword (32 bits each) "secret" which can be used to
85 * uniquify a hash. If the seed is all zeros, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106 /* Check to see if the seed is all zeros */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT4_HTREE_EOF << 1))
148 hash = (EXT4_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
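/*
 * Editorial sketch, not part of this change: one plausible way a
 * directory-indexing caller could drive ext4fs_dirhash() above.  Only the
 * dx_hash_info fields and the DX_HASH_TEA constant used above are assumed;
 * the helper name and its arguments are hypothetical.
 */
static int example_dirent_hash(const char *name, int len, __u32 *seed,
			       __u32 *major, __u32 *minor)
{
	struct dx_hash_info hinfo;

	hinfo.hash_version = DX_HASH_TEA;
	hinfo.seed = seed;	/* NULL or all-zero seed => default seed */
	if (ext4fs_dirhash(name, len, &hinfo) < 0)
		return -1;	/* hash version not supported */
	*major = hinfo.hash;
	*minor = hinfo.minor_hash;
	return 0;
}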
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
1/*
2 * linux/fs/ext4/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26#include <linux/blkdev.h>
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext4_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
64 if (!bh)
65 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %llu",
68 block_group, ext4_inode_bitmap(sb, desc));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext4_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext4_group_desc * gdp;
99 struct ext4_super_block * es;
100 struct ext4_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext4_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext4_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext4_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT4_SB(sb);
118
119 ino = inode->i_ino;
120 ext4_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext4_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT4_SB(sb)->s_es;
137 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext4_error (sb, "ext4_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext4_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext4_error (sb, "ext4_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext4_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext4_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
179 err = ext4_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
183 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext4_std_error(sb, fatal);
190}
191
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, that group with the fewest
197 * directories already is chosen.
198 *
199 * For other inodes, search forward from the parent directory's block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT4_SB(sb)->s_groups_count;
205 unsigned int freei, avefreei;
206 struct ext4_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext4_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are blockgroups whose free inode and free block counts are
235 * not worse than average, we return the one with the smallest directory count.
236 * Otherwise we simply return a random group.
237 *
238 * For the rest, the rules are as follows:
239 *
240 * It's OK to put a directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already running too large a debt (max_debt).
245 * The parent's group is preferred; if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT4_I(parent)->i_block_group;
260 struct ext4_sb_info *sbi = EXT4_SB(sb);
261 struct ext4_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
264 unsigned int freei, avefreei;
265 ext4_fsblk_t freeb, avefreeb;
266 ext4_fsblk_t blocks_per_dir;
267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes;
269 ext4_grpblk_t min_blocks;
270 int group = -1, i;
271 struct ext4_group_desc *desc;
272 struct buffer_head *bh;
273
274 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
275 avefreei = freei / ngroups;
276 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
277 avefreeb = freeb;
278 do_div(avefreeb, ngroups);
279 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
280
281 if ((parent == sb->s_root->d_inode) ||
282 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
283 int best_ndir = inodes_per_group;
284 int best_group = -1;
285
286 get_random_bytes(&group, sizeof(group));
287 parent_group = (unsigned)group % ngroups;
288 for (i = 0; i < ngroups; i++) {
289 group = (parent_group + i) % ngroups;
290 desc = ext4_get_group_desc (sb, group, &bh);
291 if (!desc || !desc->bg_free_inodes_count)
292 continue;
293 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
294 continue;
295 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
296 continue;
297 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
298 continue;
299 best_group = group;
300 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
301 }
302 if (best_group >= 0)
303 return best_group;
304 goto fallback;
305 }
306
307 blocks_per_dir = ext4_blocks_count(es) - freeb;
308 do_div(blocks_per_dir, ndirs);
309
310 max_dirs = ndirs / ngroups + inodes_per_group / 16;
311 min_inodes = avefreei - inodes_per_group / 4;
312 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
313
314 max_debt = EXT4_BLOCKS_PER_GROUP(sb);
315 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
316 if (max_debt * INODE_COST > inodes_per_group)
317 max_debt = inodes_per_group / INODE_COST;
318 if (max_debt > 255)
319 max_debt = 255;
320 if (max_debt == 0)
321 max_debt = 1;
322
323 for (i = 0; i < ngroups; i++) {
324 group = (parent_group + i) % ngroups;
325 desc = ext4_get_group_desc (sb, group, &bh);
326 if (!desc || !desc->bg_free_inodes_count)
327 continue;
328 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
329 continue;
330 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
331 continue;
332 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
333 continue;
334 return group;
335 }
336
337fallback:
338 for (i = 0; i < ngroups; i++) {
339 group = (parent_group + i) % ngroups;
340 desc = ext4_get_group_desc (sb, group, &bh);
341 if (!desc || !desc->bg_free_inodes_count)
342 continue;
343 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
344 return group;
345 }
346
347 if (avefreei) {
348 /*
349 * The free-inodes counter is approximate, and for really small
350 * filesystems the above test can fail to find any blockgroups
351 */
352 avefreei = 0;
353 goto fallback;
354 }
355
356 return -1;
357}
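/*
 * Editorial note, not part of this change: a worked example of the debt
 * clamping above, using made-up numbers.  With 32768 blocks and 8192
 * inodes per group and blocks_per_dir = 100:
 *	max_debt = 32768 / max(100, BLOCK_COST) = 32768 / 256 = 128
 * 128 * INODE_COST = 8192 does not exceed inodes_per_group, and 128 is
 * already within 1..255, so max_debt stays at 128.
 */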
358
359static int find_group_other(struct super_block *sb, struct inode *parent)
360{
361 int parent_group = EXT4_I(parent)->i_block_group;
362 int ngroups = EXT4_SB(sb)->s_groups_count;
363 struct ext4_group_desc *desc;
364 struct buffer_head *bh;
365 int group, i;
366
367 /*
368 * Try to place the inode in its parent directory
369 */
370 group = parent_group;
371 desc = ext4_get_group_desc (sb, group, &bh);
372 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
373 le16_to_cpu(desc->bg_free_blocks_count))
374 return group;
375
376 /*
377 * We're going to place this inode in a different blockgroup from its
378 * parent. We want to cause files in a common directory to all land in
379 * the same blockgroup. But we want files which are in a different
380 * directory which shares a blockgroup with our parent to land in a
381 * different blockgroup.
382 *
383 * So add our directory's i_ino into the starting point for the hash.
384 */
385 group = (group + parent->i_ino) % ngroups;
386
387 /*
388 * Use a quadratic hash to find a group with a free inode and some free
389 * blocks.
390 */
391 for (i = 1; i < ngroups; i <<= 1) {
392 group += i;
393 if (group >= ngroups)
394 group -= ngroups;
395 desc = ext4_get_group_desc (sb, group, &bh);
396 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
397 le16_to_cpu(desc->bg_free_blocks_count))
398 return group;
399 }
400
401 /*
402 * That failed: try linear search for a free inode, even if that group
403 * has no free blocks.
404 */
405 group = parent_group;
406 for (i = 0; i < ngroups; i++) {
407 if (++group >= ngroups)
408 group = 0;
409 desc = ext4_get_group_desc (sb, group, &bh);
410 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
411 return group;
412 }
413
414 return -1;
415}
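/*
 * Editorial note, not part of this change: with the quadratic probe above,
 * starting from group g the loop visits g+1, g+3, g+7, g+15, ... (offsets
 * of 2^k - 1, wrapped modulo ngroups), so each retry roughly doubles the
 * distance from the hashed starting group.
 */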
416
417/*
418 * There are two policies for allocating an inode. If the new inode is
419 * a directory, then a forward search is made for a block group with both
420 * free space and a low directory-to-inode ratio; if that fails, then of
421 * the groups with above-average free space, that group with the fewest
422 * directories already is chosen.
423 *
424 * For other inodes, search forward from the parent directory's block
425 * group to find a free inode.
426 */
427struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
428{
429 struct super_block *sb;
430 struct buffer_head *bitmap_bh = NULL;
431 struct buffer_head *bh2;
432 int group;
433 unsigned long ino = 0;
434 struct inode * inode;
435 struct ext4_group_desc * gdp = NULL;
436 struct ext4_super_block * es;
437 struct ext4_inode_info *ei;
438 struct ext4_sb_info *sbi;
439 int err = 0;
440 struct inode *ret;
441 int i;
442
443 /* Cannot create files in a deleted directory */
444 if (!dir || !dir->i_nlink)
445 return ERR_PTR(-EPERM);
446
447 sb = dir->i_sb;
448 inode = new_inode(sb);
449 if (!inode)
450 return ERR_PTR(-ENOMEM);
451 ei = EXT4_I(inode);
452
453 sbi = EXT4_SB(sb);
454 es = sbi->s_es;
455 if (S_ISDIR(mode)) {
456 if (test_opt (sb, OLDALLOC))
457 group = find_group_dir(sb, dir);
458 else
459 group = find_group_orlov(sb, dir);
460 } else
461 group = find_group_other(sb, dir);
462
463 err = -ENOSPC;
464 if (group == -1)
465 goto out;
466
467 for (i = 0; i < sbi->s_groups_count; i++) {
468 err = -EIO;
469
470 gdp = ext4_get_group_desc(sb, group, &bh2);
471 if (!gdp)
472 goto fail;
473
474 brelse(bitmap_bh);
475 bitmap_bh = read_inode_bitmap(sb, group);
476 if (!bitmap_bh)
477 goto fail;
478
479 ino = 0;
480
481repeat_in_this_group:
482 ino = ext4_find_next_zero_bit((unsigned long *)
483 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
484 if (ino < EXT4_INODES_PER_GROUP(sb)) {
485
486 BUFFER_TRACE(bitmap_bh, "get_write_access");
487 err = ext4_journal_get_write_access(handle, bitmap_bh);
488 if (err)
489 goto fail;
490
491 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
492 ino, bitmap_bh->b_data)) {
493 /* we won it */
494 BUFFER_TRACE(bitmap_bh,
495 "call ext4_journal_dirty_metadata");
496 err = ext4_journal_dirty_metadata(handle,
497 bitmap_bh);
498 if (err)
499 goto fail;
500 goto got;
501 }
502 /* we lost it */
503 jbd2_journal_release_buffer(handle, bitmap_bh);
504
505 if (++ino < EXT4_INODES_PER_GROUP(sb))
506 goto repeat_in_this_group;
507 }
508
509 /*
510 * This case is possible in a concurrent environment. It is very
511 * rare. We cannot repeat the find_group_xxx() call because
512 * it would simply return the same blockgroup: the
513 * group descriptor metadata has not yet been updated.
514 * So we just go onto the next blockgroup.
515 */
516 if (++group == sbi->s_groups_count)
517 group = 0;
518 }
519 err = -ENOSPC;
520 goto out;
521
522got:
523 ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
524 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
525 ext4_error (sb, "ext4_new_inode",
526 "reserved inode or inode > inodes count - "
527 "block_group = %d, inode=%lu", group, ino);
528 err = -EIO;
529 goto fail;
530 }
531
532 BUFFER_TRACE(bh2, "get_write_access");
533 err = ext4_journal_get_write_access(handle, bh2);
534 if (err) goto fail;
535 spin_lock(sb_bgl_lock(sbi, group));
536 gdp->bg_free_inodes_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
538 if (S_ISDIR(mode)) {
539 gdp->bg_used_dirs_count =
540 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
541 }
542 spin_unlock(sb_bgl_lock(sbi, group));
543 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
544 err = ext4_journal_dirty_metadata(handle, bh2);
545 if (err) goto fail;
546
547 percpu_counter_dec(&sbi->s_freeinodes_counter);
548 if (S_ISDIR(mode))
549 percpu_counter_inc(&sbi->s_dirs_counter);
550 sb->s_dirt = 1;
551
552 inode->i_uid = current->fsuid;
553 if (test_opt (sb, GRPID))
554 inode->i_gid = dir->i_gid;
555 else if (dir->i_mode & S_ISGID) {
556 inode->i_gid = dir->i_gid;
557 if (S_ISDIR(mode))
558 mode |= S_ISGID;
559 } else
560 inode->i_gid = current->fsgid;
561 inode->i_mode = mode;
562
563 inode->i_ino = ino;
564 /* This is the optimal IO size (for stat), not the fs block size */
565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567
568 memset(ei->i_data, 0, sizeof(ei->i_data));
569 ei->i_dir_start_lookup = 0;
570 ei->i_disksize = 0;
571
572 ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
573 if (S_ISLNK(mode))
574 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
575 /* dirsync only applies to directories */
576 if (!S_ISDIR(mode))
577 ei->i_flags &= ~EXT4_DIRSYNC_FL;
578#ifdef EXT4_FRAGMENTS
579 ei->i_faddr = 0;
580 ei->i_frag_no = 0;
581 ei->i_frag_size = 0;
582#endif
583 ei->i_file_acl = 0;
584 ei->i_dir_acl = 0;
585 ei->i_dtime = 0;
586 ei->i_block_alloc_info = NULL;
587 ei->i_block_group = group;
588
589 ext4_set_inode_flags(inode);
590 if (IS_DIRSYNC(inode))
591 handle->h_sync = 1;
592 insert_inode_hash(inode);
593 spin_lock(&sbi->s_next_gen_lock);
594 inode->i_generation = sbi->s_next_generation++;
595 spin_unlock(&sbi->s_next_gen_lock);
596
597 ei->i_state = EXT4_STATE_NEW;
598 ei->i_extra_isize =
599 (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
600 sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
601
602 ret = inode;
603 if(DQUOT_ALLOC_INODE(inode)) {
604 err = -EDQUOT;
605 goto fail_drop;
606 }
607
608 err = ext4_init_acl(handle, inode, dir);
609 if (err)
610 goto fail_free_drop;
611
612 err = ext4_init_security(handle,inode, dir);
613 if (err)
614 goto fail_free_drop;
615
616 err = ext4_mark_inode_dirty(handle, inode);
617 if (err) {
618 ext4_std_error(sb, err);
619 goto fail_free_drop;
620 }
621 if (test_opt(sb, EXTENTS)) {
622 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
623 ext4_ext_tree_init(handle, inode);
624 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
625 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
626 if (err) goto fail;
627 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
628 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
629 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
630 }
631 }
632
633 ext4_debug("allocating inode %lu\n", inode->i_ino);
634 goto really_out;
635fail:
636 ext4_std_error(sb, err);
637out:
638 iput(inode);
639 ret = ERR_PTR(err);
640really_out:
641 brelse(bitmap_bh);
642 return ret;
643
644fail_free_drop:
645 DQUOT_FREE_INODE(inode);
646
647fail_drop:
648 DQUOT_DROP(inode);
649 inode->i_flags |= S_NOQUOTA;
650 inode->i_nlink = 0;
651 iput(inode);
652 brelse(bitmap_bh);
653 return ERR_PTR(err);
654}
655
656/* Verify that we are loading a valid orphan from disk */
657struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
658{
659 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
660 unsigned long block_group;
661 int bit;
662 struct buffer_head *bitmap_bh = NULL;
663 struct inode *inode = NULL;
664
665 /* Error cases - e2fsck has already cleaned up for us */
666 if (ino > max_ino) {
667 ext4_warning(sb, __FUNCTION__,
668 "bad orphan ino %lu! e2fsck was run?", ino);
669 goto out;
670 }
671
672 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
673 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
674 bitmap_bh = read_inode_bitmap(sb, block_group);
675 if (!bitmap_bh) {
676 ext4_warning(sb, __FUNCTION__,
677 "inode bitmap error for orphan %lu", ino);
678 goto out;
679 }
680
681 /* Having the inode bit set should be a 100% indicator that this
682 * is a valid orphan (no e2fsck run on fs). Orphans also include
683 * inodes that were being truncated, so we can't check i_nlink==0.
684 */
685 if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
686 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
687 NEXT_ORPHAN(inode) > max_ino) {
688 ext4_warning(sb, __FUNCTION__,
689 "bad orphan inode %lu! e2fsck was run?", ino);
690 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
691 bit, (unsigned long long)bitmap_bh->b_blocknr,
692 ext4_test_bit(bit, bitmap_bh->b_data));
693 printk(KERN_NOTICE "inode=%p\n", inode);
694 if (inode) {
695 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
696 is_bad_inode(inode));
697 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
698 NEXT_ORPHAN(inode));
699 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
700 }
701 /* Avoid freeing blocks if we got a bad deleted inode */
702 if (inode && inode->i_nlink == 0)
703 inode->i_blocks = 0;
704 iput(inode);
705 inode = NULL;
706 }
707out:
708 brelse(bitmap_bh);
709 return inode;
710}
711
712unsigned long ext4_count_free_inodes (struct super_block * sb)
713{
714 unsigned long desc_count;
715 struct ext4_group_desc *gdp;
716 int i;
717#ifdef EXT4FS_DEBUG
718 struct ext4_super_block *es;
719 unsigned long bitmap_count, x;
720 struct buffer_head *bitmap_bh = NULL;
721
722 es = EXT4_SB(sb)->s_es;
723 desc_count = 0;
724 bitmap_count = 0;
725 gdp = NULL;
726 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
727 gdp = ext4_get_group_desc (sb, i, NULL);
728 if (!gdp)
729 continue;
730 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
731 brelse(bitmap_bh);
732 bitmap_bh = read_inode_bitmap(sb, i);
733 if (!bitmap_bh)
734 continue;
735
736 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
737 printk("group %d: stored = %d, counted = %lu\n",
738 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
739 bitmap_count += x;
740 }
741 brelse(bitmap_bh);
742 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
743 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
744 return desc_count;
745#else
746 desc_count = 0;
747 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
748 gdp = ext4_get_group_desc (sb, i, NULL);
749 if (!gdp)
750 continue;
751 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
752 cond_resched();
753 }
754 return desc_count;
755#endif
756}
757
758/* Called at mount-time, super-block is locked */
759unsigned long ext4_count_dirs (struct super_block * sb)
760{
761 unsigned long count = 0;
762 int i;
763
764 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
765 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
766 if (!gdp)
767 continue;
768 count += le16_to_cpu(gdp->bg_used_dirs_count);
769 }
770 return count;
771}
772
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
1/*
2 * linux/fs/ext4/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext4_jbd2.h>
29#include <linux/jbd2.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include <linux/bio.h>
40#include "xattr.h"
41#include "acl.h"
42
43/*
44 * Test whether an inode is a fast symlink.
45 */
46static int ext4_inode_is_fast_symlink(struct inode *inode)
47{
48 int ea_blocks = EXT4_I(inode)->i_file_acl ?
49 (inode->i_sb->s_blocksize >> 9) : 0;
50
51 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
52}
53
54/*
55 * The ext4 forget function must perform a revoke if we are freeing data
56 * which has been journaled. Metadata (e.g. indirect blocks) must be
57 * revoked in all cases.
58 *
59 * "bh" may be NULL: a metadata block may have been freed from memory
60 * but there may still be a record of it in the journal, and that record
61 * still needs to be revoked.
62 */
63int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
64 struct buffer_head *bh, ext4_fsblk_t blocknr)
65{
66 int err;
67
68 might_sleep();
69
70 BUFFER_TRACE(bh, "enter");
71
72 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
73 "data mode %lx\n",
74 bh, is_metadata, inode->i_mode,
75 test_opt(inode->i_sb, DATA_FLAGS));
76
77 /* Never use the revoke function if we are doing full data
78 * journaling: there is no need to, and a V1 superblock won't
79 * support it. Otherwise, only skip the revoke on un-journaled
80 * data blocks. */
81
82 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
83 (!is_metadata && !ext4_should_journal_data(inode))) {
84 if (bh) {
85 BUFFER_TRACE(bh, "call jbd2_journal_forget");
86 return ext4_journal_forget(handle, bh);
87 }
88 return 0;
89 }
90
91 /*
92 * data!=journal && (is_metadata || should_journal_data(inode))
93 */
94 BUFFER_TRACE(bh, "call ext4_journal_revoke");
95 err = ext4_journal_revoke(handle, blocknr, bh);
96 if (err)
97 ext4_abort(inode->i_sb, __FUNCTION__,
98 "error %d when attempting revoke", err);
99 BUFFER_TRACE(bh, "exit");
100 return err;
101}
102
103/*
104 * Work out how many blocks we need to proceed with the next chunk of a
105 * truncate transaction.
106 */
107static unsigned long blocks_for_truncate(struct inode *inode)
108{
109 unsigned long needed;
110
111 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
112
113 /* Give ourselves just enough room to cope with inodes in which
114 * i_blocks is corrupt: we've seen disk corruptions in the past
115 * which resulted in random data in an inode which looked enough
116 * like a regular file for ext4 to try to delete it. Things
117 * will go a bit crazy if that happens, but at least we should
118 * try not to panic the whole kernel. */
119 if (needed < 2)
120 needed = 2;
121
122 /* But we need to bound the transaction so we don't overflow the
123 * journal. */
124 if (needed > EXT4_MAX_TRANS_DATA)
125 needed = EXT4_MAX_TRANS_DATA;
126
127 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
128}
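/*
 * Editorial note, not part of this change: i_blocks is kept in 512-byte
 * sectors, so the shift above converts it to filesystem blocks.  With a
 * 4KB block size (s_blocksize_bits = 12), an inode with i_blocks = 2048
 * gives needed = 2048 >> 3 = 256, which is then clamped to at least 2 and
 * at most EXT4_MAX_TRANS_DATA before EXT4_DATA_TRANS_BLOCKS() is added.
 */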
129
130/*
131 * Truncate transactions can be complex and absolutely huge. So we need to
132 * be able to restart the transaction at a convenient checkpoint to make
133 * sure we don't overflow the journal.
134 *
135 * start_transaction gets us a new handle for a truncate transaction,
136 * and extend_transaction tries to extend the existing one a bit. If
137 * extend fails, we need to propagate the failure up and restart the
138 * transaction in the top-level truncate loop. --sct
139 */
140static handle_t *start_transaction(struct inode *inode)
141{
142 handle_t *result;
143
144 result = ext4_journal_start(inode, blocks_for_truncate(inode));
145 if (!IS_ERR(result))
146 return result;
147
148 ext4_std_error(inode->i_sb, PTR_ERR(result));
149 return result;
150}
151
152/*
153 * Try to extend this transaction for the purposes of truncation.
154 *
155 * Returns 0 if we managed to create more room. If we can't create more
156 * room, and the transaction must be restarted we return 1.
157 */
158static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
159{
160 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
161 return 0;
162 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
163 return 0;
164 return 1;
165}
166
167/*
168 * Restart the transaction associated with *handle. This does a commit,
169 * so before we call here everything must be consistently dirtied against
170 * this transaction.
171 */
172static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
173{
174 jbd_debug(2, "restarting handle %p\n", handle);
175 return ext4_journal_restart(handle, blocks_for_truncate(inode));
176}
177
178/*
179 * Called at the last iput() if i_nlink is zero.
180 */
181void ext4_delete_inode (struct inode * inode)
182{
183 handle_t *handle;
184
185 truncate_inode_pages(&inode->i_data, 0);
186
187 if (is_bad_inode(inode))
188 goto no_delete;
189
190 handle = start_transaction(inode);
191 if (IS_ERR(handle)) {
192 /*
193 * If we're going to skip the normal cleanup, we still need to
194 * make sure that the in-core orphan linked list is properly
195 * cleaned up.
196 */
197 ext4_orphan_del(NULL, inode);
198 goto no_delete;
199 }
200
201 if (IS_SYNC(inode))
202 handle->h_sync = 1;
203 inode->i_size = 0;
204 if (inode->i_blocks)
205 ext4_truncate(inode);
206 /*
207 * Kill off the orphan record which ext4_truncate created.
208 * AKPM: I think this can be inside the above `if'.
209 * Note that ext4_orphan_del() has to be able to cope with the
210 * deletion of a non-existent orphan - this is because we don't
211 * know if ext4_truncate() actually created an orphan record.
212 * (Well, we could do this if we need to, but heck - it works)
213 */
214 ext4_orphan_del(handle, inode);
215 EXT4_I(inode)->i_dtime = get_seconds();
216
217 /*
218 * One subtle ordering requirement: if anything has gone wrong
219 * (transaction abort, IO errors, whatever), then we can still
220 * do these next steps (the fs will already have been marked as
221 * having errors), but we can't free the inode if the mark_dirty
222 * fails.
223 */
224 if (ext4_mark_inode_dirty(handle, inode))
225 /* If that failed, just do the required in-core inode clear. */
226 clear_inode(inode);
227 else
228 ext4_free_inode(handle, inode);
229 ext4_journal_stop(handle);
230 return;
231no_delete:
232 clear_inode(inode); /* We must guarantee clearing of inode... */
233}
234
235typedef struct {
236 __le32 *p;
237 __le32 key;
238 struct buffer_head *bh;
239} Indirect;
240
241static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
242{
243 p->key = *(p->p = v);
244 p->bh = bh;
245}
246
247static int verify_chain(Indirect *from, Indirect *to)
248{
249 while (from <= to && from->key == *from->p)
250 from++;
251 return (from > to);
252}
253
254/**
255 * ext4_block_to_path - parse the block number into array of offsets
256 * @inode: inode in question (we are only interested in its superblock)
257 * @i_block: block number to be parsed
258 * @offsets: array to store the offsets in
259 * @boundary: set this non-zero if the referred-to block is likely to be
260 * followed (on disk) by an indirect block.
261 *
262 * To store the locations of file's data ext4 uses a data structure common
263 * for UNIX filesystems - tree of pointers anchored in the inode, with
264 * data blocks at leaves and indirect blocks in intermediate nodes.
265 * This function translates the block number into path in that tree -
266 * return value is the path length and @offsets[n] is the offset of
267 * pointer to (n+1)th node in the nth one. If @block is out of range
268 * (negative or too large) warning is printed and zero returned.
269 *
270 * Note: function doesn't find node addresses, so no IO is needed. All
271 * we need to know is the capacity of indirect blocks (taken from the
272 * inode->i_sb).
273 */
274
275/*
276 * Portability note: the last comparison (check that we fit into triple
277 * indirect block) is spelled differently, because otherwise on an
278 * architecture with 32-bit longs and 8Kb pages we might get into trouble
279 * if our filesystem had 8Kb blocks. We might use long long, but that would
280 * kill us on x86. Oh, well, at least the sign propagation does not matter -
281 * i_block would have to be negative in the very beginning, so we would not
282 * get there at all.
283 */
284
285static int ext4_block_to_path(struct inode *inode,
286 long i_block, int offsets[4], int *boundary)
287{
288 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
289 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
290 const long direct_blocks = EXT4_NDIR_BLOCKS,
291 indirect_blocks = ptrs,
292 double_blocks = (1 << (ptrs_bits * 2));
293 int n = 0;
294 int final = 0;
295
296 if (i_block < 0) {
297 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
298 } else if (i_block < direct_blocks) {
299 offsets[n++] = i_block;
300 final = direct_blocks;
301 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
302 offsets[n++] = EXT4_IND_BLOCK;
303 offsets[n++] = i_block;
304 final = ptrs;
305 } else if ((i_block -= indirect_blocks) < double_blocks) {
306 offsets[n++] = EXT4_DIND_BLOCK;
307 offsets[n++] = i_block >> ptrs_bits;
308 offsets[n++] = i_block & (ptrs - 1);
309 final = ptrs;
310 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
311 offsets[n++] = EXT4_TIND_BLOCK;
312 offsets[n++] = i_block >> (ptrs_bits * 2);
313 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
314 offsets[n++] = i_block & (ptrs - 1);
315 final = ptrs;
316 } else {
317 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
318 }
319 if (boundary)
320 *boundary = final - 1 - (i_block & (ptrs - 1));
321 return n;
322}
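/*
 * Editorial note, not part of this change: a worked example of the
 * decomposition above, assuming 4KB blocks (ptrs = 1024, ptrs_bits = 10)
 * and the usual 12 direct blocks:
 *	block 5    -> { 5 }				(depth 1, direct)
 *	block 12   -> { EXT4_IND_BLOCK, 0 }		(depth 2)
 *	block 2000 -> { EXT4_DIND_BLOCK, 0, 964 }	(depth 3, since
 *							 2000 - 12 - 1024 = 964)
 */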
323
324/**
325 * ext4_get_branch - read the chain of indirect blocks leading to data
326 * @inode: inode in question
327 * @depth: depth of the chain (1 - direct pointer, etc.)
328 * @offsets: offsets of pointers in inode/indirect blocks
329 * @chain: place to store the result
330 * @err: here we store the error value
331 *
332 * Function fills the array of triples <key, p, bh> and returns %NULL
333 * if everything went OK or the pointer to the last filled triple
334 * (incomplete one) otherwise. Upon the return chain[i].key contains
335 * the number of (i+1)-th block in the chain (as it is stored in memory,
336 * i.e. little-endian 32-bit), chain[i].p contains the address of that
337 * number (it points into struct inode for i==0 and into the bh->b_data
338 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
339 * block for i>0 and NULL for i==0. In other words, it holds the block
340 * numbers of the chain, addresses they were taken from (and where we can
341 * verify that chain did not change) and buffer_heads hosting these
342 * numbers.
343 *
344 * Function stops when it stumbles upon zero pointer (absent block)
345 * (pointer to last triple returned, *@err == 0)
346 * or when it gets an IO error reading an indirect block
347 * (ditto, *@err == -EIO)
348 * or when it notices that chain had been changed while it was reading
349 * (ditto, *@err == -EAGAIN)
350 * or when it reads all @depth-1 indirect blocks successfully and finds
351 * the whole chain, all way to the data (returns %NULL, *err == 0).
352 */
353static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
354 Indirect chain[4], int *err)
355{
356 struct super_block *sb = inode->i_sb;
357 Indirect *p = chain;
358 struct buffer_head *bh;
359
360 *err = 0;
361 /* i_data is not going away, no lock needed */
362 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
363 if (!p->key)
364 goto no_block;
365 while (--depth) {
366 bh = sb_bread(sb, le32_to_cpu(p->key));
367 if (!bh)
368 goto failure;
369 /* Reader: pointers */
370 if (!verify_chain(chain, p))
371 goto changed;
372 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
373 /* Reader: end */
374 if (!p->key)
375 goto no_block;
376 }
377 return NULL;
378
379changed:
380 brelse(bh);
381 *err = -EAGAIN;
382 goto no_block;
383failure:
384 *err = -EIO;
385no_block:
386 return p;
387}
388
389/**
390 * ext4_find_near - find a place for allocation with sufficient locality
391 * @inode: owner
392 * @ind: descriptor of indirect block.
393 *
394 * This function returns the preferred place for block allocation.
395 * It is used when the heuristic for sequential allocation fails.
396 * Rules are:
397 * + if there is a block to the left of our position - allocate near it.
398 * + if pointer will live in indirect block - allocate near that block.
399 * + if pointer will live in inode - allocate in the same
400 * cylinder group.
401 *
402 * In the latter case we colour the starting block by the caller's PID to
403 * prevent it from clashing with concurrent allocations for a different inode
404 * in the same block group. The PID is used here so that functionally related
405 * files will be close-by on-disk.
406 *
407 * Caller must make sure that @ind is valid and will stay that way.
408 */
409static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
410{
411 struct ext4_inode_info *ei = EXT4_I(inode);
412 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
413 __le32 *p;
414 ext4_fsblk_t bg_start;
415 ext4_grpblk_t colour;
416
417 /* Try to find previous block */
418 for (p = ind->p - 1; p >= start; p--) {
419 if (*p)
420 return le32_to_cpu(*p);
421 }
422
423 /* No such thing, so let's try location of indirect block */
424 if (ind->bh)
425 return ind->bh->b_blocknr;
426
427 /*
428 * It is going to be referred to from the inode itself? OK, just put it
429 * into the same cylinder group then.
430 */
431 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
432 colour = (current->pid % 16) *
433 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
434 return bg_start + colour;
435}
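/*
 * Editorial note, not part of this change: a worked example of the colour
 * calculation above.  With 32768 blocks per group the group is split into
 * sixteen 2048-block slices; a caller whose pid % 16 == 5 gets a goal of
 * bg_start + 5 * 2048 = bg_start + 10240, so unrelated processes tend to
 * start allocating in different slices of the same group.
 */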
436
437/**
438 * ext4_find_goal - find a preferred place for allocation.
439 * @inode: owner
440 * @block: block we want
441 * @chain: chain of indirect blocks
442 * @partial: pointer to the last triple within a chain
443 * @goal: place to store the result.
444 *
445 * Normally this function finds the preferred place for block allocation,
446 * stores it in *@goal and returns zero.
447 */
448
449static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
450 Indirect chain[4], Indirect *partial)
451{
452 struct ext4_block_alloc_info *block_i;
453
454 block_i = EXT4_I(inode)->i_block_alloc_info;
455
456 /*
457 * try the heuristic for sequential allocation,
458 * failing that at least try to get decent locality.
459 */
460 if (block_i && (block == block_i->last_alloc_logical_block + 1)
461 && (block_i->last_alloc_physical_block != 0)) {
462 return block_i->last_alloc_physical_block + 1;
463 }
464
465 return ext4_find_near(inode, partial);
466}
467
468/**
469 * ext4_blks_to_allocate: Look up the block map and count the number
470 * of direct blocks that need to be allocated for the given branch.
471 *
472 * @branch: chain of indirect blocks
473 * @k: number of blocks needed for indirect blocks
474 * @blks: number of data blocks to be mapped.
475 * @blocks_to_boundary: the offset in the indirect block
476 *
477 * return the total number of blocks to be allocated, including the
478 * direct and indirect blocks.
479 */
480static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
481 int blocks_to_boundary)
482{
483 unsigned long count = 0;
484
485 /*
486 * Simple case: the [t,d]indirect block(s) have not been allocated yet,
487 * so clearly the blocks on that path have not been allocated either
488 */
489 if (k > 0) {
490 /* right now we don't handle cross boundary allocation */
491 if (blks < blocks_to_boundary + 1)
492 count += blks;
493 else
494 count += blocks_to_boundary + 1;
495 return count;
496 }
497
498 count++;
499 while (count < blks && count <= blocks_to_boundary &&
500 le32_to_cpu(*(branch[0].p + count)) == 0) {
501 count++;
502 }
503 return count;
504}
505
506/**
507 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
508 * @indirect_blks: the number of blocks needed to allocate for indirect
509 * blocks
510 *
511 * @new_blocks: on return it will store the new block numbers for
512 * the indirect blocks(if needed) and the first direct block,
513 * @blks: on return it will store the total number of allocated
514 * direct blocks
515 */
516static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
517 ext4_fsblk_t goal, int indirect_blks, int blks,
518 ext4_fsblk_t new_blocks[4], int *err)
519{
520 int target, i;
521 unsigned long count = 0;
522 int index = 0;
523 ext4_fsblk_t current_block = 0;
524 int ret = 0;
525
526 /*
527 * Here we try to allocate the requested multiple blocks at once,
528 * on a best-effort basis.
529 * To build a branch, we should allocate blocks for
530 * the indirect blocks (if not allocated yet), and at least
531 * the first direct block of this branch. That's the
532 * minimum number of blocks we need to allocate (required).
533 */
534 target = blks + indirect_blks;
535
536 while (1) {
537 count = target;
538 /* allocating blocks for indirect blocks and direct blocks */
539 current_block = ext4_new_blocks(handle,inode,goal,&count,err);
540 if (*err)
541 goto failed_out;
542
543 target -= count;
544 /* allocate blocks for indirect blocks */
545 while (index < indirect_blks && count) {
546 new_blocks[index++] = current_block++;
547 count--;
548 }
549
550 if (count > 0)
551 break;
552 }
553
554 /* save the new block number for the first direct block */
555 new_blocks[index] = current_block;
556
557 /* total number of blocks allocated for direct blocks */
558 ret = count;
559 *err = 0;
560 return ret;
561failed_out:
562 for (i = 0; i <index; i++)
563 ext4_free_blocks(handle, inode, new_blocks[i], 1);
564 return ret;
565}
566
567/**
568 * ext4_alloc_branch - allocate and set up a chain of blocks.
569 * @inode: owner
570 * @indirect_blks: number of allocated indirect blocks
571 * @blks: number of allocated direct blocks
572 * @offsets: offsets (in the blocks) to store the pointers to next.
573 * @branch: place to store the chain in.
574 *
575 * This function allocates blocks, zeroes out all but the last one,
576 * links them into chain and (if we are synchronous) writes them to disk.
577 * In other words, it prepares a branch that can be spliced onto the
578 * inode. It stores the information about that chain in the branch[], in
579 * the same format as ext4_get_branch() would do. We are calling it after
580 * we had read the existing part of chain and partial points to the last
581 * triple of that (one with zero ->key). Upon the exit we have the same
582 * picture as after the successful ext4_get_block(), except that in one
583 * place chain is disconnected - *branch->p is still zero (we did not
584 * set the last link), but branch->key contains the number that should
585 * be placed into *branch->p to fill that gap.
586 *
587 * If allocation fails we free all blocks we've allocated (and forget
588 * their buffer_heads) and return the error value from the failed
589 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
590 * as described above and return 0.
591 */
592static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
593 int indirect_blks, int *blks, ext4_fsblk_t goal,
594 int *offsets, Indirect *branch)
595{
596 int blocksize = inode->i_sb->s_blocksize;
597 int i, n = 0;
598 int err = 0;
599 struct buffer_head *bh;
600 int num;
601 ext4_fsblk_t new_blocks[4];
602 ext4_fsblk_t current_block;
603
604 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
605 *blks, new_blocks, &err);
606 if (err)
607 return err;
608
609 branch[0].key = cpu_to_le32(new_blocks[0]);
610 /*
611 * metadata blocks and data blocks are allocated.
612 */
613 for (n = 1; n <= indirect_blks; n++) {
614 /*
615 * Get buffer_head for parent block, zero it out
616 * and set the pointer to new one, then send
617 * parent to disk.
618 */
619 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
620 branch[n].bh = bh;
621 lock_buffer(bh);
622 BUFFER_TRACE(bh, "call get_create_access");
623 err = ext4_journal_get_create_access(handle, bh);
624 if (err) {
625 unlock_buffer(bh);
626 brelse(bh);
627 goto failed;
628 }
629
630 memset(bh->b_data, 0, blocksize);
631 branch[n].p = (__le32 *) bh->b_data + offsets[n];
632 branch[n].key = cpu_to_le32(new_blocks[n]);
633 *branch[n].p = branch[n].key;
634 if ( n == indirect_blks) {
635 current_block = new_blocks[n];
636 /*
637 * End of chain, update the last new metablock of
638 * the chain to point to the new allocated
639 * data blocks numbers
640 */
641 for (i=1; i < num; i++)
642 *(branch[n].p + i) = cpu_to_le32(++current_block);
643 }
644 BUFFER_TRACE(bh, "marking uptodate");
645 set_buffer_uptodate(bh);
646 unlock_buffer(bh);
647
648 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
649 err = ext4_journal_dirty_metadata(handle, bh);
650 if (err)
651 goto failed;
652 }
653 *blks = num;
654 return err;
655failed:
656 /* Allocation failed, free what we already allocated */
657 for (i = 1; i <= n ; i++) {
658 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
659 ext4_journal_forget(handle, branch[i].bh);
660 }
661 for (i = 0; i <indirect_blks; i++)
662 ext4_free_blocks(handle, inode, new_blocks[i], 1);
663
664 ext4_free_blocks(handle, inode, new_blocks[i], num);
665
666 return err;
667}
668
669/**
670 * ext4_splice_branch - splice the allocated branch onto inode.
671 * @inode: owner
672 * @block: (logical) number of block we are adding
673 * @chain: chain of indirect blocks (with a missing link - see
674 * ext4_alloc_branch)
675 * @where: location of missing link
676 * @num: number of indirect blocks we are adding
677 * @blks: number of direct blocks we are adding
678 *
679 * This function fills the missing link and does all housekeeping needed in
680 * inode (->i_blocks, etc.). In case of success we end up with the full
681 * chain to new block and return 0.
682 */
683static int ext4_splice_branch(handle_t *handle, struct inode *inode,
684 long block, Indirect *where, int num, int blks)
685{
686 int i;
687 int err = 0;
688 struct ext4_block_alloc_info *block_i;
689 ext4_fsblk_t current_block;
690
691 block_i = EXT4_I(inode)->i_block_alloc_info;
692 /*
693 * If we're splicing into a [td]indirect block (as opposed to the
694 * inode) then we need to get write access to the [td]indirect block
695 * before the splice.
696 */
697 if (where->bh) {
698 BUFFER_TRACE(where->bh, "get_write_access");
699 err = ext4_journal_get_write_access(handle, where->bh);
700 if (err)
701 goto err_out;
702 }
703 /* That's it */
704
705 *where->p = where->key;
706
707 /*
708 * Update the host buffer_head or inode to point to the just-allocated
709 * direct blocks
710 */
711 if (num == 0 && blks > 1) {
712 current_block = le32_to_cpu(where->key) + 1;
713 for (i = 1; i < blks; i++)
714 *(where->p + i ) = cpu_to_le32(current_block++);
715 }
716
717 /*
718 * update the most recently allocated logical & physical block
719 * in i_block_alloc_info, to assist in finding the proper goal block for the next
720 * allocation
721 */
722 if (block_i) {
723 block_i->last_alloc_logical_block = block + blks - 1;
724 block_i->last_alloc_physical_block =
725 le32_to_cpu(where[num].key) + blks - 1;
726 }
727
728 /* We are done with atomic stuff, now do the rest of housekeeping */
729
730 inode->i_ctime = CURRENT_TIME_SEC;
731 ext4_mark_inode_dirty(handle, inode);
732
733 /* had we spliced it onto indirect block? */
734 if (where->bh) {
735 /*
736 * If we spliced it onto an indirect block, we haven't
737 * altered the inode. Note however that if it is being spliced
738 * onto an indirect block at the very end of the file (the
739 * file is growing) then we *will* alter the inode to reflect
740 * the new i_size. But that is not done here - it is done in
741 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
742 */
743 jbd_debug(5, "splicing indirect only\n");
744 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
745 err = ext4_journal_dirty_metadata(handle, where->bh);
746 if (err)
747 goto err_out;
748 } else {
749 /*
750 * OK, we spliced it into the inode itself on a direct block.
751 * Inode was dirtied above.
752 */
753 jbd_debug(5, "splicing direct\n");
754 }
755 return err;
756
757err_out:
758 for (i = 1; i <= num; i++) {
759 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
760 ext4_journal_forget(handle, where[i].bh);
761 ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
762 }
763 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
764
765 return err;
766}
767
768/*
769 * Allocation strategy is simple: if we have to allocate something, we will
770 * have to go the whole way to leaf. So let's do it before attaching anything
771 * to tree, set linkage between the newborn blocks, write them if sync is
772 * required, recheck the path, free and repeat if check fails, otherwise
773 * set the last missing link (that will protect us from any truncate-generated
774 * removals - all blocks on the path are immune now) and possibly force the
775 * write on the parent block.
776 * That has a nice additional property: no special recovery from the failed
777 * allocations is needed - we simply release blocks and do not touch anything
778 * reachable from inode.
779 *
780 * `handle' can be NULL if create == 0.
781 *
782 * The BKL may not be held on entry here. Be sure to take it early.
783 * return > 0, # of blocks mapped or allocated.
784 * return = 0, if plain lookup failed.
785 * return < 0, error case.
786 */
787int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
788 sector_t iblock, unsigned long maxblocks,
789 struct buffer_head *bh_result,
790 int create, int extend_disksize)
791{
792 int err = -EIO;
793 int offsets[4];
794 Indirect chain[4];
795 Indirect *partial;
796 ext4_fsblk_t goal;
797 int indirect_blks;
798 int blocks_to_boundary = 0;
799 int depth;
800 struct ext4_inode_info *ei = EXT4_I(inode);
801 int count = 0;
802 ext4_fsblk_t first_block = 0;
803
804
805 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
806 J_ASSERT(handle != NULL || create == 0);
807 depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
808
809 if (depth == 0)
810 goto out;
811
812 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
813
814 /* Simplest case - block found, no allocation needed */
815 if (!partial) {
816 first_block = le32_to_cpu(chain[depth - 1].key);
817 clear_buffer_new(bh_result);
818 count++;
819 /*map more blocks*/
820 while (count < maxblocks && count <= blocks_to_boundary) {
821 ext4_fsblk_t blk;
822
823 if (!verify_chain(chain, partial)) {
824 /*
825 * Indirect block might be removed by
826 * truncate while we were reading it.
827 * Handling of that case: forget what we've
828 * got now. Flag the err as EAGAIN, so it
829 * will reread.
830 */
831 err = -EAGAIN;
832 count = 0;
833 break;
834 }
835 blk = le32_to_cpu(*(chain[depth-1].p + count));
836
837 if (blk == first_block + count)
838 count++;
839 else
840 break;
841 }
842 if (err != -EAGAIN)
843 goto got_it;
844 }
845
846 /* Next simple case - plain lookup or failed read of indirect block */
847 if (!create || err == -EIO)
848 goto cleanup;
849
850 mutex_lock(&ei->truncate_mutex);
851
852 /*
853 * If the indirect block is missing while we are reading
854 * the chain (ext4_get_branch() returns -EAGAIN), or
855 * if the chain has been changed after we grab the semaphore,
856 * (either because another process truncated this branch, or
857 * another get_block allocated this branch) re-grab the chain to see if
858 * the requested block has been allocated or not.
859 *
860 * Since we already block the truncate/other get_block
861 * at this point, we will have the current copy of the chain when we
862 * splice the branch into the tree.
863 */
864 if (err == -EAGAIN || !verify_chain(chain, partial)) {
865 while (partial > chain) {
866 brelse(partial->bh);
867 partial--;
868 }
869 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
870 if (!partial) {
871 count++;
872 mutex_unlock(&ei->truncate_mutex);
873 if (err)
874 goto cleanup;
875 clear_buffer_new(bh_result);
876 goto got_it;
877 }
878 }
879
880 /*
881 * Okay, we need to do block allocation. Lazily initialize the block
882 * allocation info here if necessary
883 */
884 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
885 ext4_init_block_alloc_info(inode);
886
887 goal = ext4_find_goal(inode, iblock, chain, partial);
888
889	/* the number of blocks we need to allocate for [d,t]indirect blocks */
890 indirect_blks = (chain + depth) - partial - 1;
891
892 /*
893	 * Next look up the indirect map to count the total number of
894 * direct blocks to allocate for this branch.
895 */
896 count = ext4_blks_to_allocate(partial, indirect_blks,
897 maxblocks, blocks_to_boundary);
898 /*
899 * Block out ext4_truncate while we alter the tree
900 */
901 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
902 offsets + (partial - chain), partial);
903
904 /*
905 * The ext4_splice_branch call will free and forget any buffers
906 * on the new chain if there is a failure, but that risks using
907 * up transaction credits, especially for bitmaps where the
908 * credits cannot be returned. Can we handle this somehow? We
909 * may need to return -EAGAIN upwards in the worst case. --sct
910 */
911 if (!err)
912 err = ext4_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext4_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex);
922 if (err)
923 goto cleanup;
924
925 set_buffer_new(bh_result);
926got_it:
927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
928 if (count > blocks_to_boundary)
929 set_buffer_boundary(bh_result);
930 err = count;
931 /* Clean up and exit */
932 partial = chain + depth - 1; /* the whole chain */
933cleanup:
934 while (partial > chain) {
935 BUFFER_TRACE(partial->bh, "call brelse");
936 brelse(partial->bh);
937 partial--;
938 }
939 BUFFER_TRACE(bh_result, "returned");
940out:
941 return err;
942}
943
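/*
 * Journal credits asked for when ext4_get_block() or ext4_direct_IO()
 * below needs to start, extend or restart a transaction on behalf of a
 * direct-IO write.
 */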
944#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
945
946static int ext4_get_block(struct inode *inode, sector_t iblock,
947 struct buffer_head *bh_result, int create)
948{
949 handle_t *handle = journal_current_handle();
950 int ret = 0;
951 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
952
953 if (!create)
954 goto get_block; /* A read */
955
956 if (max_blocks == 1)
957 goto get_block; /* A single block get */
958
959 if (handle->h_transaction->t_state == T_LOCKED) {
960 /*
961 * Huge direct-io writes can hold off commits for long
962 * periods of time. Let this commit run.
963 */
964 ext4_journal_stop(handle);
965 handle = ext4_journal_start(inode, DIO_CREDITS);
966 if (IS_ERR(handle))
967 ret = PTR_ERR(handle);
968 goto get_block;
969 }
970
971 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
972 /*
973 * Getting low on buffer credits...
974 */
975 ret = ext4_journal_extend(handle, DIO_CREDITS);
976 if (ret > 0) {
977 /*
978 * Couldn't extend the transaction. Start a new one.
979 */
980 ret = ext4_journal_restart(handle, DIO_CREDITS);
981 }
982 }
983
984get_block:
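	/*
	 * A positive return from ext4_get_blocks_wrap() is the number of
	 * blocks mapped or allocated; convert it into a byte count in
	 * b_size and report success (0) to the caller.
	 */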
985 if (ret == 0) {
986 ret = ext4_get_blocks_wrap(handle, inode, iblock,
987 max_blocks, bh_result, create, 0);
988 if (ret > 0) {
989 bh_result->b_size = (ret << inode->i_blkbits);
990 ret = 0;
991 }
992 }
993 return ret;
994}
995
996/*
997 * `handle' can be NULL if create is zero
998 */
999struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1000 long block, int create, int *errp)
1001{
1002 struct buffer_head dummy;
1003 int fatal = 0, err;
1004
1005 J_ASSERT(handle != NULL || create == 0);
1006
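	/*
	 * Use an on-stack dummy buffer_head purely to collect the mapping
	 * from ext4_get_blocks_wrap(); the real buffer_head is obtained
	 * with sb_getblk() below once the physical block number is known.
	 */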
1007 dummy.b_state = 0;
1008 dummy.b_blocknr = -1000;
1009 buffer_trace_init(&dummy.b_history);
1010 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1011 &dummy, create, 1);
1012 /*
1013	 * ext4_get_blocks_handle() returns the number of blocks
1014	 * mapped, or 0 in the case of a hole.
1015 */
1016 if (err > 0) {
1017 if (err > 1)
1018 WARN_ON(1);
1019 err = 0;
1020 }
1021 *errp = err;
1022 if (!err && buffer_mapped(&dummy)) {
1023 struct buffer_head *bh;
1024 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1025 if (!bh) {
1026 *errp = -EIO;
1027 goto err;
1028 }
1029 if (buffer_new(&dummy)) {
1030 J_ASSERT(create != 0);
1031 J_ASSERT(handle != 0);
1032
1033 /*
1034 * Now that we do not always journal data, we should
1035 * keep in mind whether this should always journal the
1036 * new buffer as metadata. For now, regular file
1037 * writes use ext4_get_block instead, so it's not a
1038 * problem.
1039 */
1040 lock_buffer(bh);
1041 BUFFER_TRACE(bh, "call get_create_access");
1042 fatal = ext4_journal_get_create_access(handle, bh);
1043 if (!fatal && !buffer_uptodate(bh)) {
1044 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1045 set_buffer_uptodate(bh);
1046 }
1047 unlock_buffer(bh);
1048 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1049 err = ext4_journal_dirty_metadata(handle, bh);
1050 if (!fatal)
1051 fatal = err;
1052 } else {
1053 BUFFER_TRACE(bh, "not a new buffer");
1054 }
1055 if (fatal) {
1056 *errp = fatal;
1057 brelse(bh);
1058 bh = NULL;
1059 }
1060 return bh;
1061 }
1062err:
1063 return NULL;
1064}
1065
1066struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1067 int block, int create, int *err)
1068{
1069 struct buffer_head * bh;
1070
1071 bh = ext4_getblk(handle, inode, block, create, err);
1072 if (!bh)
1073 return bh;
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ_META, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 put_bh(bh);
1081 *err = -EIO;
1082 return NULL;
1083}
1084
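/*
 * Apply @fn to every buffer of @head that overlaps the byte range
 * [@from, @to) within the page.  Buffers entirely outside the range are
 * skipped, but if such a buffer is not uptodate then *@partial is set so
 * the caller knows the page cannot be marked uptodate as a whole.  The
 * walk stops at the first non-zero return from @fn, which is returned.
 */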
1085static int walk_page_buffers( handle_t *handle,
1086 struct buffer_head *head,
1087 unsigned from,
1088 unsigned to,
1089 int *partial,
1090 int (*fn)( handle_t *handle,
1091 struct buffer_head *bh))
1092{
1093 struct buffer_head *bh;
1094 unsigned block_start, block_end;
1095 unsigned blocksize = head->b_size;
1096 int err, ret = 0;
1097 struct buffer_head *next;
1098
1099 for ( bh = head, block_start = 0;
1100 ret == 0 && (bh != head || !block_start);
1101 block_start = block_end, bh = next)
1102 {
1103 next = bh->b_this_page;
1104 block_end = block_start + blocksize;
1105 if (block_end <= from || block_start >= to) {
1106 if (partial && !buffer_uptodate(bh))
1107 *partial = 1;
1108 continue;
1109 }
1110 err = (*fn)(handle, bh);
1111 if (!ret)
1112 ret = err;
1113 }
1114 return ret;
1115}
1116
1117/*
1118 * To preserve ordering, it is essential that the hole instantiation and
1119 * the data write be encapsulated in a single transaction. We cannot
1120 * close off a transaction and start a new one between the ext4_get_block()
1121 * and the commit_write(). So the jbd2_journal_start at the start of
1122 * prepare_write() is the right place to do it.
1123 *
1124 * Also, this function can nest inside ext4_writepage() ->
1125 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1126 * has generated enough buffer credits to do the whole page. So we won't
1127 * block on the journal in that case, which is good, because the caller may
1128 * be PF_MEMALLOC.
1129 *
1130 * By accident, ext4 can be reentered when a transaction is open via
1131 * quota file writes. If we were to commit the transaction while thus
1132 * reentered, there can be a deadlock - we would be holding a quota
1133 * lock, and the commit would never complete if another thread had a
1134 * transaction open and was blocking on the quota lock - a ranking
1135 * violation.
1136 *
1137 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1138 * will _not_ run commit under these circumstances because handle->h_ref
1139 * is elevated. We'll still have enough credits for the tiny quotafile
1140 * write.
1141 */
1142static int do_journal_get_write_access(handle_t *handle,
1143 struct buffer_head *bh)
1144{
1145 if (!buffer_mapped(bh) || buffer_freed(bh))
1146 return 0;
1147 return ext4_journal_get_write_access(handle, bh);
1148}
1149
1150static int ext4_prepare_write(struct file *file, struct page *page,
1151 unsigned from, unsigned to)
1152{
1153 struct inode *inode = page->mapping->host;
1154 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1155 handle_t *handle;
1156 int retries = 0;
1157
1158retry:
1159 handle = ext4_journal_start(inode, needed_blocks);
1160 if (IS_ERR(handle)) {
1161 ret = PTR_ERR(handle);
1162 goto out;
1163 }
1164 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1165 ret = nobh_prepare_write(page, from, to, ext4_get_block);
1166 else
1167 ret = block_prepare_write(page, from, to, ext4_get_block);
1168 if (ret)
1169 goto prepare_write_failed;
1170
1171 if (ext4_should_journal_data(inode)) {
1172 ret = walk_page_buffers(handle, page_buffers(page),
1173 from, to, NULL, do_journal_get_write_access);
1174 }
1175prepare_write_failed:
1176 if (ret)
1177 ext4_journal_stop(handle);
1178 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1179 goto retry;
1180out:
1181 return ret;
1182}
1183
1184int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1185{
1186 int err = jbd2_journal_dirty_data(handle, bh);
1187 if (err)
1188 ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1189 bh, handle,err);
1190 return err;
1191}
1192
1193/* For commit_write() in data=journal mode */
1194static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1195{
1196 if (!buffer_mapped(bh) || buffer_freed(bh))
1197 return 0;
1198 set_buffer_uptodate(bh);
1199 return ext4_journal_dirty_metadata(handle, bh);
1200}
1201
1202/*
1203 * We need to pick up the new inode size which generic_commit_write gave us.
1204 * `file' can be NULL - eg, when called from page_symlink().
1205 *
1206 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
1207 * buffers are managed internally.
1208 */
1209static int ext4_ordered_commit_write(struct file *file, struct page *page,
1210 unsigned from, unsigned to)
1211{
1212 handle_t *handle = ext4_journal_current_handle();
1213 struct inode *inode = page->mapping->host;
1214 int ret = 0, ret2;
1215
1216 ret = walk_page_buffers(handle, page_buffers(page),
1217 from, to, NULL, ext4_journal_dirty_data);
1218
1219 if (ret == 0) {
1220 /*
1221 * generic_commit_write() will run mark_inode_dirty() if i_size
1222 * changes. So let's piggyback the i_disksize mark_inode_dirty
1223 * into that.
1224 */
1225 loff_t new_i_size;
1226
1227 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1228 if (new_i_size > EXT4_I(inode)->i_disksize)
1229 EXT4_I(inode)->i_disksize = new_i_size;
1230 ret = generic_commit_write(file, page, from, to);
1231 }
1232 ret2 = ext4_journal_stop(handle);
1233 if (!ret)
1234 ret = ret2;
1235 return ret;
1236}
1237
1238static int ext4_writeback_commit_write(struct file *file, struct page *page,
1239 unsigned from, unsigned to)
1240{
1241 handle_t *handle = ext4_journal_current_handle();
1242 struct inode *inode = page->mapping->host;
1243 int ret = 0, ret2;
1244 loff_t new_i_size;
1245
1246 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1247 if (new_i_size > EXT4_I(inode)->i_disksize)
1248 EXT4_I(inode)->i_disksize = new_i_size;
1249
1250 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1251 ret = nobh_commit_write(file, page, from, to);
1252 else
1253 ret = generic_commit_write(file, page, from, to);
1254
1255 ret2 = ext4_journal_stop(handle);
1256 if (!ret)
1257 ret = ret2;
1258 return ret;
1259}
1260
1261static int ext4_journalled_commit_write(struct file *file,
1262 struct page *page, unsigned from, unsigned to)
1263{
1264 handle_t *handle = ext4_journal_current_handle();
1265 struct inode *inode = page->mapping->host;
1266 int ret = 0, ret2;
1267 int partial = 0;
1268 loff_t pos;
1269
1270 /*
1271 * Here we duplicate the generic_commit_write() functionality
1272 */
1273 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1274
1275 ret = walk_page_buffers(handle, page_buffers(page), from,
1276 to, &partial, commit_write_fn);
1277 if (!partial)
1278 SetPageUptodate(page);
1279 if (pos > inode->i_size)
1280 i_size_write(inode, pos);
1281 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1282 if (inode->i_size > EXT4_I(inode)->i_disksize) {
1283 EXT4_I(inode)->i_disksize = inode->i_size;
1284 ret2 = ext4_mark_inode_dirty(handle, inode);
1285 if (!ret)
1286 ret = ret2;
1287 }
1288 ret2 = ext4_journal_stop(handle);
1289 if (!ret)
1290 ret = ret2;
1291 return ret;
1292}
1293
1294/*
1295 * bmap() is special. It gets used by applications such as lilo and by
1296 * the swapper to find the on-disk block of a specific piece of data.
1297 *
1298 * Naturally, this is dangerous if the block concerned is still in the
1299 * journal. If somebody makes a swapfile on an ext4 data-journaling
1300 * filesystem and enables swap, then they may get a nasty shock when the
1301 * data getting swapped to that swapfile suddenly gets overwritten by
1302 * the original zeros written out previously to the journal and
1303 * awaiting writeback in the kernel's buffer cache.
1304 *
1305 * So, if we see any bmap calls here on a modified, data-journaled file,
1306 * take extra steps to flush any blocks which might be in the cache.
1307 */
1308static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1309{
1310 struct inode *inode = mapping->host;
1311 journal_t *journal;
1312 int err;
1313
1314 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1315 /*
1316 * This is a REALLY heavyweight approach, but the use of
1317 * bmap on dirty files is expected to be extremely rare:
1318 * only if we run lilo or swapon on a freshly made file
1319 * do we expect this to happen.
1320 *
1321 * (bmap requires CAP_SYS_RAWIO so this does not
1322 * represent an unprivileged user DOS attack --- we'd be
1323 * in trouble if mortal users could trigger this path at
1324 * will.)
1325 *
1326 * NB. EXT4_STATE_JDATA is not set on files other than
1327 * regular files. If somebody wants to bmap a directory
1328 * or symlink and gets confused because the buffer
1329 * hasn't yet been flushed to disk, they deserve
1330 * everything they get.
1331 */
1332
1333 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
1334 journal = EXT4_JOURNAL(inode);
1335 jbd2_journal_lock_updates(journal);
1336 err = jbd2_journal_flush(journal);
1337 jbd2_journal_unlock_updates(journal);
1338
1339 if (err)
1340 return 0;
1341 }
1342
1343 return generic_block_bmap(mapping,block,ext4_get_block);
1344}
1345
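/*
 * walk_page_buffers() callbacks used by ext4_ordered_writepage() to take
 * and later drop an extra reference on each buffer in the page, keeping
 * the buffer_heads alive while the page itself may be unlocked and
 * truncated underneath us.
 */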
1346static int bget_one(handle_t *handle, struct buffer_head *bh)
1347{
1348 get_bh(bh);
1349 return 0;
1350}
1351
1352static int bput_one(handle_t *handle, struct buffer_head *bh)
1353{
1354 put_bh(bh);
1355 return 0;
1356}
1357
1358static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1359{
1360 if (buffer_mapped(bh))
1361 return ext4_journal_dirty_data(handle, bh);
1362 return 0;
1363}
1364
1365/*
1366 * Note that we always start a transaction even if we're not journalling
1367 * data. This is to preserve ordering: any hole instantiation within
1368 * __block_write_full_page -> ext4_get_block() should be journalled
1369 * along with the data so we don't crash and then get metadata which
1370 * refers to old data.
1371 *
1372 * In all journalling modes block_write_full_page() will start the I/O.
1373 *
1374 * Problem:
1375 *
1376 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1377 * ext4_writepage()
1378 *
1379 * Similar for:
1380 *
1381 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1382 *
1383 * Same applies to ext4_get_block(). We will deadlock on various things like
1384 * lock_journal and i_truncate_mutex.
1385 *
1386 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1387 * allocations fail.
1388 *
1389 * 16May01: If we're reentered then journal_current_handle() will be
1390 * non-zero. We simply *return*.
1391 *
1392 * 1 July 2001: @@@ FIXME:
1393 * In journalled data mode, a data buffer may be metadata against the
1394 * current transaction. But the same file is part of a shared mapping
1395 * and someone does a writepage() on it.
1396 *
1397 * We will move the buffer onto the async_data list, but *after* it has
1398 * been dirtied. So there's a small window where we have dirty data on
1399 * BJ_Metadata.
1400 *
1401 * Note that this only applies to the last partial page in the file. The
1402 * bit which block_write_full_page() uses prepare/commit for. (That's
1403 * broken code anyway: it's wrong for msync()).
1404 *
1405 * It's a rare case: affects the final partial page, for journalled data
1406 * where the file is subject to both write() and writepage() in the same
1407 * transaction. To fix it we'll need a custom block_write_full_page().
1408 * We'll probably need that anyway for journalling writepage() output.
1409 *
1410 * We don't honour synchronous mounts for writepage(). That would be
1411 * disastrous. Any write() or metadata operation will sync the fs for
1412 * us.
1413 *
1414 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1415 * we don't need to open a transaction here.
1416 */
1417static int ext4_ordered_writepage(struct page *page,
1418 struct writeback_control *wbc)
1419{
1420 struct inode *inode = page->mapping->host;
1421 struct buffer_head *page_bufs;
1422 handle_t *handle = NULL;
1423 int ret = 0;
1424 int err;
1425
1426 J_ASSERT(PageLocked(page));
1427
1428 /*
1429 * We give up here if we're reentered, because it might be for a
1430 * different filesystem.
1431 */
1432 if (ext4_journal_current_handle())
1433 goto out_fail;
1434
1435 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1436
1437 if (IS_ERR(handle)) {
1438 ret = PTR_ERR(handle);
1439 goto out_fail;
1440 }
1441
1442 if (!page_has_buffers(page)) {
1443 create_empty_buffers(page, inode->i_sb->s_blocksize,
1444 (1 << BH_Dirty)|(1 << BH_Uptodate));
1445 }
1446 page_bufs = page_buffers(page);
1447 walk_page_buffers(handle, page_bufs, 0,
1448 PAGE_CACHE_SIZE, NULL, bget_one);
1449
1450 ret = block_write_full_page(page, ext4_get_block, wbc);
1451
1452 /*
1453 * The page can become unlocked at any point now, and
1454 * truncate can then come in and change things. So we
1455 * can't touch *page from now on. But *page_bufs is
1456 * safe due to elevated refcount.
1457 */
1458
1459 /*
1460 * And attach them to the current transaction. But only if
1461 * block_write_full_page() succeeded. Otherwise they are unmapped,
1462 * and generally junk.
1463 */
1464 if (ret == 0) {
1465 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1466 NULL, jbd2_journal_dirty_data_fn);
1467 if (!ret)
1468 ret = err;
1469 }
1470 walk_page_buffers(handle, page_bufs, 0,
1471 PAGE_CACHE_SIZE, NULL, bput_one);
1472 err = ext4_journal_stop(handle);
1473 if (!ret)
1474 ret = err;
1475 return ret;
1476
1477out_fail:
1478 redirty_page_for_writepage(wbc, page);
1479 unlock_page(page);
1480 return ret;
1481}
1482
1483static int ext4_writeback_writepage(struct page *page,
1484 struct writeback_control *wbc)
1485{
1486 struct inode *inode = page->mapping->host;
1487 handle_t *handle = NULL;
1488 int ret = 0;
1489 int err;
1490
1491 if (ext4_journal_current_handle())
1492 goto out_fail;
1493
1494 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1495 if (IS_ERR(handle)) {
1496 ret = PTR_ERR(handle);
1497 goto out_fail;
1498 }
1499
1500 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1501 ret = nobh_writepage(page, ext4_get_block, wbc);
1502 else
1503 ret = block_write_full_page(page, ext4_get_block, wbc);
1504
1505 err = ext4_journal_stop(handle);
1506 if (!ret)
1507 ret = err;
1508 return ret;
1509
1510out_fail:
1511 redirty_page_for_writepage(wbc, page);
1512 unlock_page(page);
1513 return ret;
1514}
1515
1516static int ext4_journalled_writepage(struct page *page,
1517 struct writeback_control *wbc)
1518{
1519 struct inode *inode = page->mapping->host;
1520 handle_t *handle = NULL;
1521 int ret = 0;
1522 int err;
1523
1524 if (ext4_journal_current_handle())
1525 goto no_write;
1526
1527 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1528 if (IS_ERR(handle)) {
1529 ret = PTR_ERR(handle);
1530 goto no_write;
1531 }
1532
1533 if (!page_has_buffers(page) || PageChecked(page)) {
1534 /*
1535 * It's mmapped pagecache. Add buffers and journal it. There
1536 * doesn't seem much point in redirtying the page here.
1537 */
1538 ClearPageChecked(page);
1539 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1540 ext4_get_block);
1541 if (ret != 0) {
1542 ext4_journal_stop(handle);
1543 goto out_unlock;
1544 }
1545 ret = walk_page_buffers(handle, page_buffers(page), 0,
1546 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1547
1548 err = walk_page_buffers(handle, page_buffers(page), 0,
1549 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1550 if (ret == 0)
1551 ret = err;
1552 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1553 unlock_page(page);
1554 } else {
1555 /*
1556 * It may be a page full of checkpoint-mode buffers. We don't
1557 * really know unless we go poke around in the buffer_heads.
1558 * But block_write_full_page will do the right thing.
1559 */
1560 ret = block_write_full_page(page, ext4_get_block, wbc);
1561 }
1562 err = ext4_journal_stop(handle);
1563 if (!ret)
1564 ret = err;
1565out:
1566 return ret;
1567
1568no_write:
1569 redirty_page_for_writepage(wbc, page);
1570out_unlock:
1571 unlock_page(page);
1572 goto out;
1573}
1574
1575static int ext4_readpage(struct file *file, struct page *page)
1576{
1577 return mpage_readpage(page, ext4_get_block);
1578}
1579
1580static int
1581ext4_readpages(struct file *file, struct address_space *mapping,
1582 struct list_head *pages, unsigned nr_pages)
1583{
1584 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
1585}
1586
1587static void ext4_invalidatepage(struct page *page, unsigned long offset)
1588{
1589 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1590
1591 /*
1592 * If it's a full truncate we just forget about the pending dirtying
1593 */
1594 if (offset == 0)
1595 ClearPageChecked(page);
1596
1597 jbd2_journal_invalidatepage(journal, page, offset);
1598}
1599
1600static int ext4_releasepage(struct page *page, gfp_t wait)
1601{
1602 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1603
1604 WARN_ON(PageChecked(page));
1605 if (!page_has_buffers(page))
1606 return 0;
1607 return jbd2_journal_try_to_free_buffers(journal, page, wait);
1608}
1609
1610/*
1611 * If the O_DIRECT write will extend the file then add this inode to the
1612 * orphan list. So recovery will truncate it back to the original size
1613 * if the machine crashes during the write.
1614 *
1615 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1616 * crashes then stale disk data _may_ be exposed inside the file.
1617 */
1618static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1619 const struct iovec *iov, loff_t offset,
1620 unsigned long nr_segs)
1621{
1622 struct file *file = iocb->ki_filp;
1623 struct inode *inode = file->f_mapping->host;
1624 struct ext4_inode_info *ei = EXT4_I(inode);
1625 handle_t *handle = NULL;
1626 ssize_t ret;
1627 int orphan = 0;
1628 size_t count = iov_length(iov, nr_segs);
1629
1630 if (rw == WRITE) {
1631 loff_t final_size = offset + count;
1632
1633 handle = ext4_journal_start(inode, DIO_CREDITS);
1634 if (IS_ERR(handle)) {
1635 ret = PTR_ERR(handle);
1636 goto out;
1637 }
1638 if (final_size > inode->i_size) {
1639 ret = ext4_orphan_add(handle, inode);
1640 if (ret)
1641 goto out_stop;
1642 orphan = 1;
1643 ei->i_disksize = inode->i_size;
1644 }
1645 }
1646
1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1648 offset, nr_segs,
1649 ext4_get_block, NULL);
1650
1651 /*
1652 * Reacquire the handle: ext4_get_block() can restart the transaction
1653 */
1654 handle = journal_current_handle();
1655
1656out_stop:
1657 if (handle) {
1658 int err;
1659
1660 if (orphan && inode->i_nlink)
1661 ext4_orphan_del(handle, inode);
1662 if (orphan && ret > 0) {
1663 loff_t end = offset + ret;
1664 if (end > inode->i_size) {
1665 ei->i_disksize = end;
1666 i_size_write(inode, end);
1667 /*
1668 * We're going to return a positive `ret'
1669 * here due to non-zero-length I/O, so there's
1670 * no way of reporting error returns from
1671 * ext4_mark_inode_dirty() to userspace. So
1672 * ignore it.
1673 */
1674 ext4_mark_inode_dirty(handle, inode);
1675 }
1676 }
1677 err = ext4_journal_stop(handle);
1678 if (ret == 0)
1679 ret = err;
1680 }
1681out:
1682 return ret;
1683}
1684
1685/*
1686 * Pages can be marked dirty completely asynchronously from ext4's journalling
1687 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1688 * much here because ->set_page_dirty is called under VFS locks. The page is
1689 * not necessarily locked.
1690 *
1691 * We cannot just dirty the page and leave attached buffers clean, because the
1692 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1693 * or jbddirty because all the journalling code will explode.
1694 *
1695 * So what we do is to mark the page "pending dirty" and next time writepage
1696 * is called, propagate that into the buffers appropriately.
1697 */
1698static int ext4_journalled_set_page_dirty(struct page *page)
1699{
1700 SetPageChecked(page);
1701 return __set_page_dirty_nobuffers(page);
1702}
1703
1704static const struct address_space_operations ext4_ordered_aops = {
1705 .readpage = ext4_readpage,
1706 .readpages = ext4_readpages,
1707 .writepage = ext4_ordered_writepage,
1708 .sync_page = block_sync_page,
1709 .prepare_write = ext4_prepare_write,
1710 .commit_write = ext4_ordered_commit_write,
1711 .bmap = ext4_bmap,
1712 .invalidatepage = ext4_invalidatepage,
1713 .releasepage = ext4_releasepage,
1714 .direct_IO = ext4_direct_IO,
1715 .migratepage = buffer_migrate_page,
1716};
1717
1718static const struct address_space_operations ext4_writeback_aops = {
1719 .readpage = ext4_readpage,
1720 .readpages = ext4_readpages,
1721 .writepage = ext4_writeback_writepage,
1722 .sync_page = block_sync_page,
1723 .prepare_write = ext4_prepare_write,
1724 .commit_write = ext4_writeback_commit_write,
1725 .bmap = ext4_bmap,
1726 .invalidatepage = ext4_invalidatepage,
1727 .releasepage = ext4_releasepage,
1728 .direct_IO = ext4_direct_IO,
1729 .migratepage = buffer_migrate_page,
1730};
1731
1732static const struct address_space_operations ext4_journalled_aops = {
1733 .readpage = ext4_readpage,
1734 .readpages = ext4_readpages,
1735 .writepage = ext4_journalled_writepage,
1736 .sync_page = block_sync_page,
1737 .prepare_write = ext4_prepare_write,
1738 .commit_write = ext4_journalled_commit_write,
1739 .set_page_dirty = ext4_journalled_set_page_dirty,
1740 .bmap = ext4_bmap,
1741 .invalidatepage = ext4_invalidatepage,
1742 .releasepage = ext4_releasepage,
1743};
1744
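/*
 * Pick the address_space operations matching the inode's data
 * journalling mode: ordered, writeback or full data journalling.
 */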
1745void ext4_set_aops(struct inode *inode)
1746{
1747 if (ext4_should_order_data(inode))
1748 inode->i_mapping->a_ops = &ext4_ordered_aops;
1749 else if (ext4_should_writeback_data(inode))
1750 inode->i_mapping->a_ops = &ext4_writeback_aops;
1751 else
1752 inode->i_mapping->a_ops = &ext4_journalled_aops;
1753}
1754
1755/*
1756 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
1757 * up to the end of the block which corresponds to `from'.
1758 * This is required during truncate. We need to physically zero the tail end
1759 * of that block so it doesn't yield old data if the file is later grown.
1760 */
1761int ext4_block_truncate_page(handle_t *handle, struct page *page,
1762 struct address_space *mapping, loff_t from)
1763{
1764 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1765 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1766 unsigned blocksize, iblock, length, pos;
1767 struct inode *inode = mapping->host;
1768 struct buffer_head *bh;
1769 int err = 0;
1770 void *kaddr;
1771
1772 blocksize = inode->i_sb->s_blocksize;
1773 length = blocksize - (offset & (blocksize - 1));
1774 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1775
1776 /*
1777	 * For the "nobh" option, we can only work if we don't need to
1778	 * read in the page - otherwise we create buffers to do the IO.
1779 */
1780 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1781 ext4_should_writeback_data(inode) && PageUptodate(page)) {
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 memset(kaddr + offset, 0, length);
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 set_page_dirty(page);
1787 goto unlock;
1788 }
1789
1790 if (!page_has_buffers(page))
1791 create_empty_buffers(page, blocksize, 0);
1792
1793 /* Find the buffer that contains "offset" */
1794 bh = page_buffers(page);
1795 pos = blocksize;
1796 while (offset >= pos) {
1797 bh = bh->b_this_page;
1798 iblock++;
1799 pos += blocksize;
1800 }
1801
1802 err = 0;
1803 if (buffer_freed(bh)) {
1804 BUFFER_TRACE(bh, "freed: skip");
1805 goto unlock;
1806 }
1807
1808 if (!buffer_mapped(bh)) {
1809 BUFFER_TRACE(bh, "unmapped");
1810 ext4_get_block(inode, iblock, bh, 0);
1811 /* unmapped? It's a hole - nothing to do */
1812 if (!buffer_mapped(bh)) {
1813 BUFFER_TRACE(bh, "still unmapped");
1814 goto unlock;
1815 }
1816 }
1817
1818 /* Ok, it's mapped. Make sure it's up-to-date */
1819 if (PageUptodate(page))
1820 set_buffer_uptodate(bh);
1821
1822 if (!buffer_uptodate(bh)) {
1823 err = -EIO;
1824 ll_rw_block(READ, 1, &bh);
1825 wait_on_buffer(bh);
1826 /* Uhhuh. Read error. Complain and punt. */
1827 if (!buffer_uptodate(bh))
1828 goto unlock;
1829 }
1830
1831 if (ext4_should_journal_data(inode)) {
1832 BUFFER_TRACE(bh, "get write access");
1833 err = ext4_journal_get_write_access(handle, bh);
1834 if (err)
1835 goto unlock;
1836 }
1837
1838 kaddr = kmap_atomic(page, KM_USER0);
1839 memset(kaddr + offset, 0, length);
1840 flush_dcache_page(page);
1841 kunmap_atomic(kaddr, KM_USER0);
1842
1843 BUFFER_TRACE(bh, "zeroed end of block");
1844
1845 err = 0;
1846 if (ext4_should_journal_data(inode)) {
1847 err = ext4_journal_dirty_metadata(handle, bh);
1848 } else {
1849 if (ext4_should_order_data(inode))
1850 err = ext4_journal_dirty_data(handle, bh);
1851 mark_buffer_dirty(bh);
1852 }
1853
1854unlock:
1855 unlock_page(page);
1856 page_cache_release(page);
1857 return err;
1858}
1859
1860/*
1861 * Probably it should be a library function... search for first non-zero word
1862 * or memcmp with zero_page, whatever is better for particular architecture.
1863 * Linus?
1864 */
1865static inline int all_zeroes(__le32 *p, __le32 *q)
1866{
1867 while (p < q)
1868 if (*p++)
1869 return 0;
1870 return 1;
1871}
1872
1873/**
1874 * ext4_find_shared - find the indirect blocks for partial truncation.
1875 * @inode: inode in question
1876 * @depth: depth of the affected branch
1877 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
1878 * @chain: place to store the pointers to partial indirect blocks
1879 * @top: place to store the (detached) top of branch
1880 *
1881 * This is a helper function used by ext4_truncate().
1882 *
1883 * When we do truncate() we may have to clean the ends of several
1884 * indirect blocks but leave the blocks themselves alive. Block is
1885 * partially truncated if some data below the new i_size is referred to
1886 * from it (and it is on the path to the first completely truncated
1887 * data block, indeed). We have to free the top of that path along
1888 * with everything to the right of the path. Since no allocation
1889 * past the truncation point is possible until ext4_truncate()
1890 * finishes, we may safely do the latter, but top of branch may
1891 * require special attention - pageout below the truncation point
1892 * might try to populate it.
1893 *
1894 * We atomically detach the top of branch from the tree, store the
1895 * block number of its root in *@top, pointers to buffer_heads of
1896 * partially truncated blocks - in @chain[].bh and pointers to
1897 * their last elements that should not be removed - in
1898 * @chain[].p. Return value is the pointer to last filled element
1899 * of @chain.
1900 *
1901 * It is left to the caller to do the actual freeing of subtrees:
1902 * a) free the subtree starting from *@top
1903 * b) free the subtrees whose roots are stored in
1904 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1905 * c) free the subtrees growing from the inode past the @chain[0].
1906 * (no partially truncated stuff there). */
1907
1908static Indirect *ext4_find_shared(struct inode *inode, int depth,
1909 int offsets[4], Indirect chain[4], __le32 *top)
1910{
1911 Indirect *partial, *p;
1912 int k, err;
1913
1914 *top = 0;
1915	/* Make k index the deepest non-null offset + 1 */
1916 for (k = depth; k > 1 && !offsets[k-1]; k--)
1917 ;
1918 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1919 /* Writer: pointers */
1920 if (!partial)
1921 partial = chain + k-1;
1922 /*
1923 * If the branch acquired continuation since we've looked at it -
1924 * fine, it should all survive and (new) top doesn't belong to us.
1925 */
1926 if (!partial->key && *partial->p)
1927 /* Writer: end */
1928 goto no_top;
1929 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1930 ;
1931 /*
1932 * OK, we've found the last block that must survive. The rest of our
1933 * branch should be detached before unlocking. However, if that rest
1934 * of branch is all ours and does not grow immediately from the inode
1935 * it's easier to cheat and just decrement partial->p.
1936 */
1937 if (p == chain + k - 1 && p > chain) {
1938 p->p--;
1939 } else {
1940 *top = *p->p;
1941 /* Nope, don't do this in ext4. Must leave the tree intact */
1942#if 0
1943 *p->p = 0;
1944#endif
1945 }
1946 /* Writer: end */
1947
1948 while(partial > p) {
1949 brelse(partial->bh);
1950 partial--;
1951 }
1952no_top:
1953 return partial;
1954}
1955
1956/*
1957 * Zero a number of block pointers in either an inode or an indirect block.
1958 * If we restart the transaction we must again get write access to the
1959 * indirect block for further modification.
1960 *
1961 * We release `count' blocks on disk, but (last - first) may be greater
1962 * than `count' because there can be holes in there.
1963 */
1964static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
1965 struct buffer_head *bh, ext4_fsblk_t block_to_free,
1966 unsigned long count, __le32 *first, __le32 *last)
1967{
1968 __le32 *p;
1969 if (try_to_extend_transaction(handle, inode)) {
1970 if (bh) {
1971 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1972 ext4_journal_dirty_metadata(handle, bh);
1973 }
1974 ext4_mark_inode_dirty(handle, inode);
1975 ext4_journal_test_restart(handle, inode);
1976 if (bh) {
1977 BUFFER_TRACE(bh, "retaking write access");
1978 ext4_journal_get_write_access(handle, bh);
1979 }
1980 }
1981
1982 /*
1983 * Any buffers which are on the journal will be in memory. We find
1984 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
1985 * on them. We've already detached each block from the file, so
1986 * bforget() in jbd2_journal_forget() should be safe.
1987 *
1988 * AKPM: turn on bforget in jbd2_journal_forget()!!!
1989 */
1990 for (p = first; p < last; p++) {
1991 u32 nr = le32_to_cpu(*p);
1992 if (nr) {
1993 struct buffer_head *bh;
1994
1995 *p = 0;
1996 bh = sb_find_get_block(inode->i_sb, nr);
1997 ext4_forget(handle, 0, inode, bh, nr);
1998 }
1999 }
2000
2001 ext4_free_blocks(handle, inode, block_to_free, count);
2002}
2003
2004/**
2005 * ext4_free_data - free a list of data blocks
2006 * @handle: handle for this transaction
2007 * @inode: inode we are dealing with
2008 * @this_bh: indirect buffer_head which contains *@first and *@last
2009 * @first: array of block numbers
2010 * @last: points immediately past the end of array
2011 *
2012 * We are freeing all blocks referred to from that array (numbers are stored as
2013 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2014 *
2015 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2016 * blocks are contiguous then releasing them at one time will only affect one
2017 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2018 * actually use a lot of journal space.
2019 *
2020 * @this_bh will be %NULL if @first and @last point into the inode's direct
2021 * block pointers.
2022 */
2023static void ext4_free_data(handle_t *handle, struct inode *inode,
2024 struct buffer_head *this_bh,
2025 __le32 *first, __le32 *last)
2026{
2027 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
2028 unsigned long count = 0; /* Number of blocks in the run */
2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2030 corresponding to
2031 block_to_free */
2032 ext4_fsblk_t nr; /* Current block # */
2033 __le32 *p; /* Pointer into inode/ind
2034 for current block */
2035 int err;
2036
2037 if (this_bh) { /* For indirect block */
2038 BUFFER_TRACE(this_bh, "get_write_access");
2039 err = ext4_journal_get_write_access(handle, this_bh);
2040 /* Important: if we can't update the indirect pointers
2041 * to the blocks, we can't free them. */
2042 if (err)
2043 return;
2044 }
2045
2046 for (p = first; p < last; p++) {
2047 nr = le32_to_cpu(*p);
2048 if (nr) {
2049 /* accumulate blocks to free if they're contiguous */
2050 if (count == 0) {
2051 block_to_free = nr;
2052 block_to_free_p = p;
2053 count = 1;
2054 } else if (nr == block_to_free + count) {
2055 count++;
2056 } else {
2057 ext4_clear_blocks(handle, inode, this_bh,
2058 block_to_free,
2059 count, block_to_free_p, p);
2060 block_to_free = nr;
2061 block_to_free_p = p;
2062 count = 1;
2063 }
2064 }
2065 }
2066
2067 if (count > 0)
2068 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
2069 count, block_to_free_p, p);
2070
2071 if (this_bh) {
2072 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2073 ext4_journal_dirty_metadata(handle, this_bh);
2074 }
2075}
2076
2077/**
2078 * ext4_free_branches - free an array of branches
2079 * @handle: JBD handle for this transaction
2080 * @inode: inode we are dealing with
2081 * @parent_bh: the buffer_head which contains *@first and *@last
2082 * @first: array of block numbers
2083 * @last: pointer immediately past the end of array
2084 * @depth: depth of the branches to free
2085 *
2086 * We are freeing all blocks referred to from these branches (numbers are
2087 * stored as little-endian 32-bit) and updating @inode->i_blocks
2088 * appropriately.
2089 */
2090static void ext4_free_branches(handle_t *handle, struct inode *inode,
2091 struct buffer_head *parent_bh,
2092 __le32 *first, __le32 *last, int depth)
2093{
2094 ext4_fsblk_t nr;
2095 __le32 *p;
2096
2097 if (is_handle_aborted(handle))
2098 return;
2099
2100 if (depth--) {
2101 struct buffer_head *bh;
2102 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2103 p = last;
2104 while (--p >= first) {
2105 nr = le32_to_cpu(*p);
2106 if (!nr)
2107 continue; /* A hole */
2108
2109 /* Go read the buffer for the next level down */
2110 bh = sb_bread(inode->i_sb, nr);
2111
2112 /*
2113 * A read failure? Report error and clear slot
2114 * (should be rare).
2115 */
2116 if (!bh) {
2117 ext4_error(inode->i_sb, "ext4_free_branches",
2118 "Read failure, inode=%lu, block=%llu",
2119 inode->i_ino, nr);
2120 continue;
2121 }
2122
2123 /* This zaps the entire block. Bottom up. */
2124 BUFFER_TRACE(bh, "free child branches");
2125 ext4_free_branches(handle, inode, bh,
2126 (__le32*)bh->b_data,
2127 (__le32*)bh->b_data + addr_per_block,
2128 depth);
2129
2130 /*
2131 * We've probably journalled the indirect block several
2132 * times during the truncate. But it's no longer
2133 * needed and we now drop it from the transaction via
2134 * jbd2_journal_revoke().
2135 *
2136 * That's easy if it's exclusively part of this
2137 * transaction. But if it's part of the committing
2138 * transaction then jbd2_journal_forget() will simply
2139 * brelse() it. That means that if the underlying
2140 * block is reallocated in ext4_get_block(),
2141 * unmap_underlying_metadata() will find this block
2142 * and will try to get rid of it. damn, damn.
2143 *
2144 * If this block has already been committed to the
2145 * journal, a revoke record will be written. And
2146 * revoke records must be emitted *before* clearing
2147 * this block's bit in the bitmaps.
2148 */
2149 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
2150
2151 /*
2152			 * Everything below this pointer has been
2153 * released. Now let this top-of-subtree go.
2154 *
2155 * We want the freeing of this indirect block to be
2156 * atomic in the journal with the updating of the
2157 * bitmap block which owns it. So make some room in
2158 * the journal.
2159 *
2160 * We zero the parent pointer *after* freeing its
2161 * pointee in the bitmaps, so if extend_transaction()
2162 * for some reason fails to put the bitmap changes and
2163 * the release into the same transaction, recovery
2164 * will merely complain about releasing a free block,
2165 * rather than leaking blocks.
2166 */
2167 if (is_handle_aborted(handle))
2168 return;
2169 if (try_to_extend_transaction(handle, inode)) {
2170 ext4_mark_inode_dirty(handle, inode);
2171 ext4_journal_test_restart(handle, inode);
2172 }
2173
2174 ext4_free_blocks(handle, inode, nr, 1);
2175
2176 if (parent_bh) {
2177 /*
2178 * The block which we have just freed is
2179 * pointed to by an indirect block: journal it
2180 */
2181 BUFFER_TRACE(parent_bh, "get_write_access");
2182 if (!ext4_journal_get_write_access(handle,
2183 parent_bh)){
2184 *p = 0;
2185 BUFFER_TRACE(parent_bh,
2186 "call ext4_journal_dirty_metadata");
2187 ext4_journal_dirty_metadata(handle,
2188 parent_bh);
2189 }
2190 }
2191 }
2192 } else {
2193 /* We have reached the bottom of the tree. */
2194 BUFFER_TRACE(parent_bh, "free data blocks");
2195 ext4_free_data(handle, inode, parent_bh, first, last);
2196 }
2197}
2198
2199/*
2200 * ext4_truncate()
2201 *
2202 * We block out ext4_get_block() block instantiations across the entire
2203 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
2204 * simultaneously on behalf of the same inode.
2205 *
2206 * As we work through the truncate and commit bits of it to the journal there
2207 * is one core guiding principle: the file's tree must always be consistent on
2208 * disk. We must be able to restart the truncate after a crash.
2209 *
2210 * The file's tree may be transiently inconsistent in memory (although it
2211 * probably isn't), but whenever we close off and commit a journal transaction,
2212 * the contents of (the filesystem + the journal) must be consistent and
2213 * restartable. It's pretty simple, really: bottom up, right to left (although
2214 * left-to-right works OK too).
2215 *
2216 * Note that at recovery time, journal replay occurs *before* the restart of
2217 * truncate against the orphan inode list.
2218 *
2219 * The committed inode has the new, desired i_size (which is the same as
2220 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
2221 * that this inode's truncate did not complete and it will again call
2222 * ext4_truncate() to have another go. So there will be instantiated blocks
2223 * to the right of the truncation point in a crashed ext4 filesystem. But
2224 * that's fine - as long as they are linked from the inode, the post-crash
2225 * ext4_truncate() run will find them and release them.
2226 */
2227void ext4_truncate(struct inode *inode)
2228{
2229 handle_t *handle;
2230 struct ext4_inode_info *ei = EXT4_I(inode);
2231 __le32 *i_data = ei->i_data;
2232 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2233 struct address_space *mapping = inode->i_mapping;
2234 int offsets[4];
2235 Indirect chain[4];
2236 Indirect *partial;
2237 __le32 nr = 0;
2238 int n;
2239 long last_block;
2240 unsigned blocksize = inode->i_sb->s_blocksize;
2241 struct page *page;
2242
2243 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2244 S_ISLNK(inode->i_mode)))
2245 return;
2246 if (ext4_inode_is_fast_symlink(inode))
2247 return;
2248 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2249 return;
2250
2251 /*
2252 * We have to lock the EOF page here, because lock_page() nests
2253 * outside jbd2_journal_start().
2254 */
2255 if ((inode->i_size & (blocksize - 1)) == 0) {
2256 /* Block boundary? Nothing to do */
2257 page = NULL;
2258 } else {
2259 page = grab_cache_page(mapping,
2260 inode->i_size >> PAGE_CACHE_SHIFT);
2261 if (!page)
2262 return;
2263 }
2264
2265 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2266 return ext4_ext_truncate(inode, page);
2267
2268 handle = start_transaction(inode);
2269 if (IS_ERR(handle)) {
2270 if (page) {
2271 clear_highpage(page);
2272 flush_dcache_page(page);
2273 unlock_page(page);
2274 page_cache_release(page);
2275 }
2276 return; /* AKPM: return what? */
2277 }
2278
2279 last_block = (inode->i_size + blocksize-1)
2280 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2281
2282 if (page)
2283 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2284
2285 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2286 if (n == 0)
2287 goto out_stop; /* error */
2288
2289 /*
2290 * OK. This truncate is going to happen. We add the inode to the
2291 * orphan list, so that if this truncate spans multiple transactions,
2292 * and we crash, we will resume the truncate when the filesystem
2293 * recovers. It also marks the inode dirty, to catch the new size.
2294 *
2295 * Implication: the file must always be in a sane, consistent
2296 * truncatable state while each transaction commits.
2297 */
2298 if (ext4_orphan_add(handle, inode))
2299 goto out_stop;
2300
2301 /*
2302 * The orphan list entry will now protect us from any crash which
2303 * occurs before the truncate completes, so it is now safe to propagate
2304 * the new, shorter inode size (held for now in i_size) into the
2305 * on-disk inode. We do this via i_disksize, which is the value which
2306 * ext4 *really* writes onto the disk inode.
2307 */
2308 ei->i_disksize = inode->i_size;
2309
2310 /*
2311 * From here we block out all ext4_get_block() callers who want to
2312 * modify the block allocation tree.
2313 */
2314 mutex_lock(&ei->truncate_mutex);
2315
2316 if (n == 1) { /* direct blocks */
2317 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2318 i_data + EXT4_NDIR_BLOCKS);
2319 goto do_indirects;
2320 }
2321
2322 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
2323 /* Kill the top of shared branch (not detached) */
2324 if (nr) {
2325 if (partial == chain) {
2326 /* Shared branch grows from the inode */
2327 ext4_free_branches(handle, inode, NULL,
2328 &nr, &nr+1, (chain+n-1) - partial);
2329 *partial->p = 0;
2330 /*
2331 * We mark the inode dirty prior to restart,
2332 * and prior to stop. No need for it here.
2333 */
2334 } else {
2335 /* Shared branch grows from an indirect block */
2336 BUFFER_TRACE(partial->bh, "get_write_access");
2337 ext4_free_branches(handle, inode, partial->bh,
2338 partial->p,
2339 partial->p+1, (chain+n-1) - partial);
2340 }
2341 }
2342 /* Clear the ends of indirect blocks on the shared branch */
2343 while (partial > chain) {
2344 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
2345 (__le32*)partial->bh->b_data+addr_per_block,
2346 (chain+n-1) - partial);
2347 BUFFER_TRACE(partial->bh, "call brelse");
2348 brelse (partial->bh);
2349 partial--;
2350 }
2351do_indirects:
2352 /* Kill the remaining (whole) subtrees */
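	/*
	 * The switch below falls through deliberately: depending on how
	 * deep the truncation point lies, every whole indirect subtree
	 * beyond it (IND, then DIND, then TIND) is freed in turn.
	 */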
2353 switch (offsets[0]) {
2354 default:
2355 nr = i_data[EXT4_IND_BLOCK];
2356 if (nr) {
2357 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2358 i_data[EXT4_IND_BLOCK] = 0;
2359 }
2360 case EXT4_IND_BLOCK:
2361 nr = i_data[EXT4_DIND_BLOCK];
2362 if (nr) {
2363 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2364 i_data[EXT4_DIND_BLOCK] = 0;
2365 }
2366 case EXT4_DIND_BLOCK:
2367 nr = i_data[EXT4_TIND_BLOCK];
2368 if (nr) {
2369 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2370 i_data[EXT4_TIND_BLOCK] = 0;
2371 }
2372 case EXT4_TIND_BLOCK:
2373 ;
2374 }
2375
2376 ext4_discard_reservation(inode);
2377
2378 mutex_unlock(&ei->truncate_mutex);
2379 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2380 ext4_mark_inode_dirty(handle, inode);
2381
2382 /*
2383 * In a multi-transaction truncate, we only make the final transaction
2384 * synchronous
2385 */
2386 if (IS_SYNC(inode))
2387 handle->h_sync = 1;
2388out_stop:
2389 /*
2390 * If this was a simple ftruncate(), and the file will remain alive
2391 * then we need to clear up the orphan record which we created above.
2392 * However, if this was a real unlink then we were called by
2393 * ext4_delete_inode(), and we allow that function to clean up the
2394 * orphan info for us.
2395 */
2396 if (inode->i_nlink)
2397 ext4_orphan_del(handle, inode);
2398
2399 ext4_journal_stop(handle);
2400}
2401
2402static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2403 unsigned long ino, struct ext4_iloc *iloc)
2404{
2405 unsigned long desc, group_desc, block_group;
2406 unsigned long offset;
2407 ext4_fsblk_t block;
2408 struct buffer_head *bh;
2409 struct ext4_group_desc * gdp;
2410
2411 if (!ext4_valid_inum(sb, ino)) {
2412 /*
2413 * This error is already checked for in namei.c unless we are
2414 * looking at an NFS filehandle, in which case no error
2415 * report is needed
2416 */
2417 return 0;
2418 }
2419
2420 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2421 if (block_group >= EXT4_SB(sb)->s_groups_count) {
2422 ext4_error(sb,"ext4_get_inode_block","group >= groups count");
2423 return 0;
2424 }
2425 smp_rmb();
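	/*
	 * Group descriptors are packed several to a block: work out which
	 * descriptor block holds this group and the descriptor's offset
	 * within that block.
	 */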
2426 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2427 desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2428 bh = EXT4_SB(sb)->s_group_desc[group_desc];
2429 if (!bh) {
2430 ext4_error (sb, "ext4_get_inode_block",
2431 "Descriptor not loaded");
2432 return 0;
2433 }
2434
2435 gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
2436 desc * EXT4_DESC_SIZE(sb));
2437 /*
2438 * Figure out the offset within the block group inode table
2439 */
2440 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
2441 EXT4_INODE_SIZE(sb);
2442 block = ext4_inode_table(sb, gdp) +
2443 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
2444
2445 iloc->block_group = block_group;
2446 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
2447 return block;
2448}
2449
2450/*
2451 * ext4_get_inode_loc returns with an extra refcount against the inode's
2452 * underlying buffer_head on success. If 'in_mem' is true, we have all
2453 * data in memory that is needed to recreate the on-disk version of this
2454 * inode.
2455 */
2456static int __ext4_get_inode_loc(struct inode *inode,
2457 struct ext4_iloc *iloc, int in_mem)
2458{
2459 ext4_fsblk_t block;
2460 struct buffer_head *bh;
2461
2462 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2463 if (!block)
2464 return -EIO;
2465
2466 bh = sb_getblk(inode->i_sb, block);
2467 if (!bh) {
2468 ext4_error (inode->i_sb, "ext4_get_inode_loc",
2469 "unable to read inode block - "
2470 "inode=%lu, block=%llu",
2471 inode->i_ino, block);
2472 return -EIO;
2473 }
2474 if (!buffer_uptodate(bh)) {
2475 lock_buffer(bh);
2476 if (buffer_uptodate(bh)) {
2477 /* someone brought it uptodate while we waited */
2478 unlock_buffer(bh);
2479 goto has_buffer;
2480 }
2481
2482 /*
2483		 * If we have all the inode's information in memory and this
2484 * is the only valid inode in the block, we need not read the
2485 * block.
2486 */
2487 if (in_mem) {
2488 struct buffer_head *bitmap_bh;
2489 struct ext4_group_desc *desc;
2490 int inodes_per_buffer;
2491 int inode_offset, i;
2492 int block_group;
2493 int start;
2494
2495 block_group = (inode->i_ino - 1) /
2496 EXT4_INODES_PER_GROUP(inode->i_sb);
2497 inodes_per_buffer = bh->b_size /
2498 EXT4_INODE_SIZE(inode->i_sb);
2499 inode_offset = ((inode->i_ino - 1) %
2500 EXT4_INODES_PER_GROUP(inode->i_sb));
2501 start = inode_offset & ~(inodes_per_buffer - 1);
2502
2503 /* Is the inode bitmap in cache? */
2504 desc = ext4_get_group_desc(inode->i_sb,
2505 block_group, NULL);
2506 if (!desc)
2507 goto make_io;
2508
2509 bitmap_bh = sb_getblk(inode->i_sb,
2510 ext4_inode_bitmap(inode->i_sb, desc));
2511 if (!bitmap_bh)
2512 goto make_io;
2513
2514 /*
2515 * If the inode bitmap isn't in cache then the
2516 * optimisation may end up performing two reads instead
2517 * of one, so skip it.
2518 */
2519 if (!buffer_uptodate(bitmap_bh)) {
2520 brelse(bitmap_bh);
2521 goto make_io;
2522 }
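			/*
			 * See whether any other inode sharing this buffer is
			 * in use; if so we must read the block from disk,
			 * otherwise the whole buffer can simply be zeroed.
			 */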
2523 for (i = start; i < start + inodes_per_buffer; i++) {
2524 if (i == inode_offset)
2525 continue;
2526 if (ext4_test_bit(i, bitmap_bh->b_data))
2527 break;
2528 }
2529 brelse(bitmap_bh);
2530 if (i == start + inodes_per_buffer) {
2531 /* all other inodes are free, so skip I/O */
2532 memset(bh->b_data, 0, bh->b_size);
2533 set_buffer_uptodate(bh);
2534 unlock_buffer(bh);
2535 goto has_buffer;
2536 }
2537 }
2538
2539make_io:
2540 /*
2541 * There are other valid inodes in the buffer, this inode
2542 * has in-inode xattrs, or we don't have this inode in memory.
2543 * Read the block from disk.
2544 */
2545 get_bh(bh);
2546 bh->b_end_io = end_buffer_read_sync;
2547 submit_bh(READ_META, bh);
2548 wait_on_buffer(bh);
2549 if (!buffer_uptodate(bh)) {
2550 ext4_error(inode->i_sb, "ext4_get_inode_loc",
2551 "unable to read inode block - "
2552 "inode=%lu, block=%llu",
2553 inode->i_ino, block);
2554 brelse(bh);
2555 return -EIO;
2556 }
2557 }
2558has_buffer:
2559 iloc->bh = bh;
2560 return 0;
2561}
2562
2563int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
2564{
2565 /* We have all inode data except xattrs in memory here. */
2566 return __ext4_get_inode_loc(inode, iloc,
2567 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
2568}
2569
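/*
 * Propagate the ext4 on-disk inode flags (EXT4_*_FL) into the
 * corresponding generic VFS inode flags (S_*).
 */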
2570void ext4_set_inode_flags(struct inode *inode)
2571{
2572 unsigned int flags = EXT4_I(inode)->i_flags;
2573
2574 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2575 if (flags & EXT4_SYNC_FL)
2576 inode->i_flags |= S_SYNC;
2577 if (flags & EXT4_APPEND_FL)
2578 inode->i_flags |= S_APPEND;
2579 if (flags & EXT4_IMMUTABLE_FL)
2580 inode->i_flags |= S_IMMUTABLE;
2581 if (flags & EXT4_NOATIME_FL)
2582 inode->i_flags |= S_NOATIME;
2583 if (flags & EXT4_DIRSYNC_FL)
2584 inode->i_flags |= S_DIRSYNC;
2585}
2586
2587void ext4_read_inode(struct inode * inode)
2588{
2589 struct ext4_iloc iloc;
2590 struct ext4_inode *raw_inode;
2591 struct ext4_inode_info *ei = EXT4_I(inode);
2592 struct buffer_head *bh;
2593 int block;
2594
2595#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
2596 ei->i_acl = EXT4_ACL_NOT_CACHED;
2597 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
2598#endif
2599 ei->i_block_alloc_info = NULL;
2600
2601 if (__ext4_get_inode_loc(inode, &iloc, 0))
2602 goto bad_inode;
2603 bh = iloc.bh;
2604 raw_inode = ext4_raw_inode(&iloc);
2605 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2606 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2607 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2608 if(!(test_opt (inode->i_sb, NO_UID32))) {
2609 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2610 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2611 }
2612 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2613 inode->i_size = le32_to_cpu(raw_inode->i_size);
2614 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2615 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2616 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2617 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2618
2619 ei->i_state = 0;
2620 ei->i_dir_start_lookup = 0;
2621 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2622 /* We now have enough fields to check if the inode was active or not.
2623	 * This is needed because nfsd might try to access dead inodes;
2624	 * the test is the same one that e2fsck uses.
2625 * NeilBrown 1999oct15
2626 */
2627 if (inode->i_nlink == 0) {
2628 if (inode->i_mode == 0 ||
2629 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2630 /* this inode is deleted */
2631 brelse (bh);
2632 goto bad_inode;
2633 }
2634 /* The only unlinked inodes we let through here have
2635 * valid i_mode and are being read by the orphan
2636 * recovery code: that's fine, we're about to complete
2637 * the process of deleting those. */
2638 }
2639 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2640 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2641#ifdef EXT4_FRAGMENTS
2642 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2643 ei->i_frag_no = raw_inode->i_frag;
2644 ei->i_frag_size = raw_inode->i_fsize;
2645#endif
2646 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2647 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2648 cpu_to_le32(EXT4_OS_HURD))
2649 ei->i_file_acl |=
2650 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
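	/*
	 * The same on-disk word is read as i_dir_acl for directories and
	 * special files, and as i_size_high (the upper 32 bits of the
	 * file size) for regular files.
	 */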
2651 if (!S_ISREG(inode->i_mode)) {
2652 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2653 } else {
2654 inode->i_size |=
2655 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2656 }
2657 ei->i_disksize = inode->i_size;
2658 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2659 ei->i_block_group = iloc.block_group;
2660 /*
2661 * NOTE! The in-memory inode i_data array is in little-endian order
2662 * even on big-endian machines: we do NOT byteswap the block numbers!
2663 */
2664 for (block = 0; block < EXT4_N_BLOCKS; block++)
2665 ei->i_data[block] = raw_inode->i_block[block];
2666 INIT_LIST_HEAD(&ei->i_orphan);
2667
2668 if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
2669 EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2670 /*
2671 * When mke2fs creates big inodes it does not zero out
2672 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
2673 * so ignore those first few inodes.
2674 */
2675 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2676 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2677 EXT4_INODE_SIZE(inode->i_sb))
2678 goto bad_inode;
2679 if (ei->i_extra_isize == 0) {
2680 /* The extra space is currently unused. Use it. */
2681 ei->i_extra_isize = sizeof(struct ext4_inode) -
2682 EXT4_GOOD_OLD_INODE_SIZE;
2683 } else {
2684 __le32 *magic = (void *)raw_inode +
2685 EXT4_GOOD_OLD_INODE_SIZE +
2686 ei->i_extra_isize;
2687 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
2688 ei->i_state |= EXT4_STATE_XATTR;
2689 }
2690 } else
2691 ei->i_extra_isize = 0;
2692
2693 if (S_ISREG(inode->i_mode)) {
2694 inode->i_op = &ext4_file_inode_operations;
2695 inode->i_fop = &ext4_file_operations;
2696 ext4_set_aops(inode);
2697 } else if (S_ISDIR(inode->i_mode)) {
2698 inode->i_op = &ext4_dir_inode_operations;
2699 inode->i_fop = &ext4_dir_operations;
2700 } else if (S_ISLNK(inode->i_mode)) {
2701 if (ext4_inode_is_fast_symlink(inode))
2702 inode->i_op = &ext4_fast_symlink_inode_operations;
2703 else {
2704 inode->i_op = &ext4_symlink_inode_operations;
2705 ext4_set_aops(inode);
2706 }
2707 } else {
2708 inode->i_op = &ext4_special_inode_operations;
2709 if (raw_inode->i_block[0])
2710 init_special_inode(inode, inode->i_mode,
2711 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2712 else
2713 init_special_inode(inode, inode->i_mode,
2714 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2715 }
2716 brelse (iloc.bh);
2717 ext4_set_inode_flags(inode);
2718 return;
2719
2720bad_inode:
2721 make_bad_inode(inode);
2722 return;
2723}
2724
2725/*
2726 * Post the struct inode info into an on-disk inode location in the
2727 * buffer-cache. This gobbles the caller's reference to the
2728 * buffer_head in the inode location struct.
2729 *
2730 * The caller must have write access to iloc->bh.
2731 */
2732static int ext4_do_update_inode(handle_t *handle,
2733 struct inode *inode,
2734 struct ext4_iloc *iloc)
2735{
2736 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
2737 struct ext4_inode_info *ei = EXT4_I(inode);
2738 struct buffer_head *bh = iloc->bh;
2739 int err = 0, rc, block;
2740
2741	/* For fields not tracked in the in-memory inode,
2742	 * initialise them to zero for new inodes. */
2743 if (ei->i_state & EXT4_STATE_NEW)
2744 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
2745
2746 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2747 if(!(test_opt(inode->i_sb, NO_UID32))) {
2748 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2749 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2750/*
2751 * Fix up interoperability with old kernels. Otherwise, old inodes get
2752 * re-used with the upper 16 bits of the uid/gid intact
2753 */
2754 if(!ei->i_dtime) {
2755 raw_inode->i_uid_high =
2756 cpu_to_le16(high_16_bits(inode->i_uid));
2757 raw_inode->i_gid_high =
2758 cpu_to_le16(high_16_bits(inode->i_gid));
2759 } else {
2760 raw_inode->i_uid_high = 0;
2761 raw_inode->i_gid_high = 0;
2762 }
2763 } else {
2764 raw_inode->i_uid_low =
2765 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2766 raw_inode->i_gid_low =
2767 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2768 raw_inode->i_uid_high = 0;
2769 raw_inode->i_gid_high = 0;
2770 }
2771 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2772 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2773 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2774 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2775 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2776 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2777 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2778 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2779#ifdef EXT4_FRAGMENTS
2780 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2781 raw_inode->i_frag = ei->i_frag_no;
2782 raw_inode->i_fsize = ei->i_frag_size;
2783#endif
2784 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2785 cpu_to_le32(EXT4_OS_HURD))
2786 raw_inode->i_file_acl_high =
2787 cpu_to_le16(ei->i_file_acl >> 32);
2788 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2789 if (!S_ISREG(inode->i_mode)) {
2790 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2791 } else {
2792 raw_inode->i_size_high =
2793 cpu_to_le32(ei->i_disksize >> 32);
2794 if (ei->i_disksize > 0x7fffffffULL) {
2795 struct super_block *sb = inode->i_sb;
2796 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2797 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2798 EXT4_SB(sb)->s_es->s_rev_level ==
2799 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2800 /* If this is the first large file
2801 * created, add a flag to the superblock.
2802 */
2803 err = ext4_journal_get_write_access(handle,
2804 EXT4_SB(sb)->s_sbh);
2805 if (err)
2806 goto out_brelse;
2807 ext4_update_dynamic_rev(sb);
2808 EXT4_SET_RO_COMPAT_FEATURE(sb,
2809 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2810 sb->s_dirt = 1;
2811 handle->h_sync = 1;
2812 err = ext4_journal_dirty_metadata(handle,
2813 EXT4_SB(sb)->s_sbh);
2814 }
2815 }
2816 }
2817 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2818 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2819 if (old_valid_dev(inode->i_rdev)) {
2820 raw_inode->i_block[0] =
2821 cpu_to_le32(old_encode_dev(inode->i_rdev));
2822 raw_inode->i_block[1] = 0;
2823 } else {
2824 raw_inode->i_block[0] = 0;
2825 raw_inode->i_block[1] =
2826 cpu_to_le32(new_encode_dev(inode->i_rdev));
2827 raw_inode->i_block[2] = 0;
2828 }
2829 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2830 raw_inode->i_block[block] = ei->i_data[block];
2831
2832 if (ei->i_extra_isize)
2833 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2834
2835 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2836 rc = ext4_journal_dirty_metadata(handle, bh);
2837 if (!err)
2838 err = rc;
2839 ei->i_state &= ~EXT4_STATE_NEW;
2840
2841out_brelse:
2842 brelse (bh);
2843 ext4_std_error(inode->i_sb, err);
2844 return err;
2845}
2846
2847/*
2848 * ext4_write_inode()
2849 *
2850 * We are called from a few places:
2851 *
2852 * - Within generic_file_write() for O_SYNC files.
2853 * Here, there will be no transaction running. We wait for any running
2854 * transaction to commit.
2855 *
2856 * - Within sys_sync(), kupdate and such.
2857 * We wait on commit, if told to.
2858 *
2859 * - Within prune_icache() (PF_MEMALLOC == true)
2860 * Here we simply return. We can't afford to block kswapd on the
2861 * journal commit.
2862 *
2863 * In all cases it is actually safe for us to return without doing anything,
2864 * because the inode has been copied into a raw inode buffer in
2865 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2866 * knfsd.
2867 *
2868 * Note that we are absolutely dependent upon all inode dirtiers doing the
2869 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2870 * which we are interested.
2871 *
2872 * It would be a bug for them to not do this. The code:
2873 *
2874 * mark_inode_dirty(inode)
2875 * stuff();
2876 * inode->i_size = expr;
2877 *
2878 * is in error because a kswapd-driven write_inode() could occur while
2879 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2880 * will no longer be on the superblock's dirty inode list.
2881 */
2882int ext4_write_inode(struct inode *inode, int wait)
2883{
2884 if (current->flags & PF_MEMALLOC)
2885 return 0;
2886
2887 if (ext4_journal_current_handle()) {
2888 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2889 dump_stack();
2890 return -EIO;
2891 }
2892
2893 if (!wait)
2894 return 0;
2895
2896 return ext4_force_commit(inode->i_sb);
2897}
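/*
 * A minimal sketch of the safe ordering implied by the comment above
 * ext4_write_inode(): update every field of interest first, and only
 * then call mark_inode_dirty(), so a kswapd-driven write_inode()
 * cannot snapshot a half-updated inode or drop it from the dirty list.
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);
 */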
2898
2899/*
2900 * ext4_setattr()
2901 *
2902 * Called from notify_change.
2903 *
2904 * We want to trap VFS attempts to truncate the file as soon as
2905 * possible. In particular, we want to make sure that when the VFS
2906 * shrinks i_size, we put the inode on the orphan list and modify
2907 * i_disksize immediately, so that during the subsequent flushing of
2908 * dirty pages and freeing of disk blocks, we can guarantee that any
2909 * commit will leave the blocks being flushed in an unused state on
2910 * disk. (On recovery, the inode will get truncated and the blocks will
2911 * be freed, so we have a strong guarantee that no future commit will
2912 * leave these blocks visible to the user.)
2913 *
2914 * Called with inode->sem down.
2915 */
2916int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2917{
2918 struct inode *inode = dentry->d_inode;
2919 int error, rc = 0;
2920 const unsigned int ia_valid = attr->ia_valid;
2921
2922 error = inode_change_ok(inode, attr);
2923 if (error)
2924 return error;
2925
2926 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2927 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2928 handle_t *handle;
2929
2930 /* (user+group)*(old+new) structure, inode write (sb,
2931 * inode block, ? - but truncate inode update has it) */
2932 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2933 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2934 if (IS_ERR(handle)) {
2935 error = PTR_ERR(handle);
2936 goto err_out;
2937 }
2938 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2939 if (error) {
2940 ext4_journal_stop(handle);
2941 return error;
2942 }
2943 /* Update corresponding info in inode so that everything is in
2944 * one transaction */
2945 if (attr->ia_valid & ATTR_UID)
2946 inode->i_uid = attr->ia_uid;
2947 if (attr->ia_valid & ATTR_GID)
2948 inode->i_gid = attr->ia_gid;
2949 error = ext4_mark_inode_dirty(handle, inode);
2950 ext4_journal_stop(handle);
2951 }
2952
2953 if (S_ISREG(inode->i_mode) &&
2954 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2955 handle_t *handle;
2956
2957 handle = ext4_journal_start(inode, 3);
2958 if (IS_ERR(handle)) {
2959 error = PTR_ERR(handle);
2960 goto err_out;
2961 }
2962
2963 error = ext4_orphan_add(handle, inode);
2964 EXT4_I(inode)->i_disksize = attr->ia_size;
2965 rc = ext4_mark_inode_dirty(handle, inode);
2966 if (!error)
2967 error = rc;
2968 ext4_journal_stop(handle);
2969 }
2970
2971 rc = inode_setattr(inode, attr);
2972
2973 /* If inode_setattr's call to ext4_truncate failed to get a
2974 * transaction handle at all, we need to clean up the in-core
2975 * orphan list manually. */
2976 if (inode->i_nlink)
2977 ext4_orphan_del(NULL, inode);
2978
2979 if (!rc && (ia_valid & ATTR_MODE))
2980 rc = ext4_acl_chmod(inode);
2981
2982err_out:
2983 ext4_std_error(inode->i_sb, error);
2984 if (!error)
2985 error = rc;
2986 return error;
2987}
2988
2989
2990/*
2991 * How many blocks doth make a writepage()?
2992 *
2993 * With N blocks per page, it may be:
2994 * N data blocks
2995 * 2 indirect block
2996 * 2 dindirect
2997 * 1 tindirect
2998 * N+5 bitmap blocks (from the above)
2999 * N+5 group descriptor summary blocks
3000 * 1 inode block
3001 * 1 superblock.
3002 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
3003 *
3004 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
3005 *
3006 * With ordered or writeback data it's the same, less the N data blocks.
3007 *
3008 * If the inode's direct blocks can hold an integral number of pages then a
3009 * page cannot straddle two indirect blocks, and we can only touch one indirect
3010 * and dindirect block, and the "5" above becomes "3".
3011 *
3012 * This still overestimates under most circumstances. If we were to pass the
3013 * start and end offsets in here as well we could do block_to_path() on each
3014 * block and work out the exact number of indirects which are touched. Pah.
3015 */
3016
3017int ext4_writepage_trans_blocks(struct inode *inode)
3018{
3019 int bpp = ext4_journal_blocks_per_page(inode);
3020 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
3021 int ret;
3022
3023 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3024 return ext4_ext_writepage_trans_blocks(inode, bpp);
3025
3026 if (ext4_should_journal_data(inode))
3027 ret = 3 * (bpp + indirects) + 2;
3028 else
3029 ret = 2 * (bpp + indirects) + 2;
3030
3031#ifdef CONFIG_QUOTA
3032	/* We know that the structure was already allocated during DQUOT_INIT,
3033	 * so we will be updating only the data blocks + inodes */
3034 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
3035#endif
3036
3037 return ret;
3038}
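/*
 * A worked example of the estimate above, assuming a 4 KB page and
 * 1 KB blocks (so bpp = 4): EXT4_NDIR_BLOCKS (12) is a multiple of bpp,
 * so indirects = 3.  Journalled data then reserves
 * 3 * (4 + 3) + 2 = 23 credits, ordered/writeback data reserves
 * 2 * (4 + 3) + 2 = 16, plus 2 * EXT4_QUOTA_TRANS_BLOCKS(sb) when quota
 * is configured.
 */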
3039
3040/*
3041 * The caller must have previously called ext4_reserve_inode_write().
3042 * Given this, we know that the caller already has write access to iloc->bh.
3043 */
3044int ext4_mark_iloc_dirty(handle_t *handle,
3045 struct inode *inode, struct ext4_iloc *iloc)
3046{
3047 int err = 0;
3048
3049 /* the do_update_inode consumes one bh->b_count */
3050 get_bh(iloc->bh);
3051
3052 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
3053 err = ext4_do_update_inode(handle, inode, iloc);
3054 put_bh(iloc->bh);
3055 return err;
3056}
3057
3058/*
3059 * On success, we end up with an outstanding reference count against
3060 * iloc->bh. This _must_ be cleaned up later.
3061 */
3062
3063int
3064ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3065 struct ext4_iloc *iloc)
3066{
3067 int err = 0;
3068 if (handle) {
3069 err = ext4_get_inode_loc(inode, iloc);
3070 if (!err) {
3071 BUFFER_TRACE(iloc->bh, "get_write_access");
3072 err = ext4_journal_get_write_access(handle, iloc->bh);
3073 if (err) {
3074 brelse(iloc->bh);
3075 iloc->bh = NULL;
3076 }
3077 }
3078 }
3079 ext4_std_error(inode->i_sb, err);
3080 return err;
3081}
3082
3083/*
3084 * What we do here is to mark the in-core inode as clean with respect to inode
3085 * dirtiness (it may still be data-dirty).
3086 * This means that the in-core inode may be reaped by prune_icache
3087 * without having to perform any I/O. This is a very good thing,
3088 * because *any* task may call prune_icache - even ones which
3089 * have a transaction open against a different journal.
3090 *
3091 * Is this cheating? Not really. Sure, we haven't written the
3092 * inode out, but prune_icache isn't a user-visible syncing function.
3093 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3094 * we start and wait on commits.
3095 *
3096 * Is this efficient/effective? Well, we're being nice to the system
3097 * by cleaning up our inodes proactively so they can be reaped
3098 * without I/O. But we are potentially leaving up to five seconds'
3099 * worth of inodes floating about which prune_icache wants us to
3100 * write out. One way to fix that would be to get prune_icache()
3101 * to do a write_super() to free up some memory. It has the desired
3102 * effect.
3103 */
3104int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
3105{
3106 struct ext4_iloc iloc;
3107 int err;
3108
3109 might_sleep();
3110 err = ext4_reserve_inode_write(handle, inode, &iloc);
3111 if (!err)
3112 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
3113 return err;
3114}
3115
3116/*
3117 * ext4_dirty_inode() is called from __mark_inode_dirty()
3118 *
3119 * We're really interested in the case where a file is being extended.
3120 * i_size has been changed by generic_commit_write() and we thus need
3121 * to include the updated inode in the current transaction.
3122 *
3123 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3124 * are allocated to the file.
3125 *
3126 * If the inode is marked synchronous, we don't honour that here - doing
3127 * so would cause a commit on atime updates, which we don't bother doing.
3128 * We handle synchronous inodes at the highest possible level.
3129 */
3130void ext4_dirty_inode(struct inode *inode)
3131{
3132 handle_t *current_handle = ext4_journal_current_handle();
3133 handle_t *handle;
3134
3135 handle = ext4_journal_start(inode, 2);
3136 if (IS_ERR(handle))
3137 goto out;
3138 if (current_handle &&
3139 current_handle->h_transaction != handle->h_transaction) {
3140 /* This task has a transaction open against a different fs */
3141 printk(KERN_EMERG "%s: transactions do not match!\n",
3142 __FUNCTION__);
3143 } else {
3144 jbd_debug(5, "marking dirty. outer handle=%p\n",
3145 current_handle);
3146 ext4_mark_inode_dirty(handle, inode);
3147 }
3148 ext4_journal_stop(handle);
3149out:
3150 return;
3151}
3152
3153#if 0
3154/*
3155 * Bind an inode's backing buffer_head into this transaction, to prevent
3156 * it from being flushed to disk early. Unlike
3157 * ext4_reserve_inode_write, this leaves behind no bh reference and
3158 * returns no iloc structure, so the caller needs to repeat the iloc
3159 * lookup to mark the inode dirty later.
3160 */
3161static int ext4_pin_inode(handle_t *handle, struct inode *inode)
3162{
3163 struct ext4_iloc iloc;
3164
3165 int err = 0;
3166 if (handle) {
3167 err = ext4_get_inode_loc(inode, &iloc);
3168 if (!err) {
3169 BUFFER_TRACE(iloc.bh, "get_write_access");
3170 err = jbd2_journal_get_write_access(handle, iloc.bh);
3171 if (!err)
3172 err = ext4_journal_dirty_metadata(handle,
3173 iloc.bh);
3174 brelse(iloc.bh);
3175 }
3176 }
3177 ext4_std_error(inode->i_sb, err);
3178 return err;
3179}
3180#endif
3181
3182int ext4_change_inode_journal_flag(struct inode *inode, int val)
3183{
3184 journal_t *journal;
3185 handle_t *handle;
3186 int err;
3187
3188 /*
3189 * We have to be very careful here: changing a data block's
3190 * journaling status dynamically is dangerous. If we write a
3191 * data block to the journal, change the status and then delete
3192 * that block, we risk forgetting to revoke the old log record
3193 * from the journal and so a subsequent replay can corrupt data.
3194 * So, first we make sure that the journal is empty and that
3195 * nobody is changing anything.
3196 */
3197
3198 journal = EXT4_JOURNAL(inode);
3199 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3200 return -EROFS;
3201
3202 jbd2_journal_lock_updates(journal);
3203 jbd2_journal_flush(journal);
3204
3205 /*
3206 * OK, there are no updates running now, and all cached data is
3207 * synced to disk. We are now in a completely consistent state
3208 * which doesn't have anything in the journal, and we know that
3209 * no filesystem updates are running, so it is safe to modify
3210 * the inode's in-core data-journaling state flag now.
3211 */
3212
3213 if (val)
3214 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
3215 else
3216 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
3217 ext4_set_aops(inode);
3218
3219 jbd2_journal_unlock_updates(journal);
3220
3221 /* Finally we can mark the inode as dirty. */
3222
3223 handle = ext4_journal_start(inode, 1);
3224 if (IS_ERR(handle))
3225 return PTR_ERR(handle);
3226
3227 err = ext4_mark_inode_dirty(handle, inode);
3228 handle->h_sync = 1;
3229 ext4_journal_stop(handle);
3230 ext4_std_error(inode->i_sb, err);
3231
3232 return err;
3233}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
1/*
2 * linux/fs/ext4/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h>
13#include <linux/ext4_fs.h>
14#include <linux/ext4_jbd2.h>
15#include <linux/time.h>
16#include <linux/compat.h>
17#include <linux/smp_lock.h>
18#include <asm/uaccess.h>
19
20int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
21 unsigned long arg)
22{
23 struct ext4_inode_info *ei = EXT4_I(inode);
24 unsigned int flags;
25 unsigned short rsv_window_size;
26
27 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28
29 switch (cmd) {
30 case EXT4_IOC_GETFLAGS:
31 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg);
33 case EXT4_IOC_SETFLAGS: {
34 handle_t *handle = NULL;
35 int err;
36 struct ext4_iloc iloc;
37 unsigned int oldflags;
38 unsigned int jflag;
39
40 if (IS_RDONLY(inode))
41 return -EROFS;
42
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
44 return -EACCES;
45
46 if (get_user(flags, (int __user *) arg))
47 return -EFAULT;
48
49 if (!S_ISDIR(inode->i_mode))
50 flags &= ~EXT4_DIRSYNC_FL;
51
52 mutex_lock(&inode->i_mutex);
53 oldflags = ei->i_flags;
54
55 /* The JOURNAL_DATA flag is modifiable only by root */
56 jflag = flags & EXT4_JOURNAL_DATA_FL;
57
58 /*
59 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
60 * the relevant capability.
61 *
62 * This test looks nicer. Thanks to Pauline Middelink
63 */
64 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
65 if (!capable(CAP_LINUX_IMMUTABLE)) {
66 mutex_unlock(&inode->i_mutex);
67 return -EPERM;
68 }
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE)) {
77 mutex_unlock(&inode->i_mutex);
78 return -EPERM;
79 }
80 }
81
82
83 handle = ext4_journal_start(inode, 1);
84 if (IS_ERR(handle)) {
85 mutex_unlock(&inode->i_mutex);
86 return PTR_ERR(handle);
87 }
88 if (IS_SYNC(inode))
89 handle->h_sync = 1;
90 err = ext4_reserve_inode_write(handle, inode, &iloc);
91 if (err)
92 goto flags_err;
93
94 flags = flags & EXT4_FL_USER_MODIFIABLE;
95 flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
96 ei->i_flags = flags;
97
98 ext4_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC;
100
101 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
102flags_err:
103 ext4_journal_stop(handle);
104 if (err) {
105 mutex_unlock(&inode->i_mutex);
106 return err;
107 }
108
109 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
110 err = ext4_change_inode_journal_flag(inode, jflag);
111 mutex_unlock(&inode->i_mutex);
112 return err;
113 }
114 case EXT4_IOC_GETVERSION:
115 case EXT4_IOC_GETVERSION_OLD:
116 return put_user(inode->i_generation, (int __user *) arg);
117 case EXT4_IOC_SETVERSION:
118 case EXT4_IOC_SETVERSION_OLD: {
119 handle_t *handle;
120 struct ext4_iloc iloc;
121 __u32 generation;
122 int err;
123
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
125 return -EPERM;
126 if (IS_RDONLY(inode))
127 return -EROFS;
128 if (get_user(generation, (int __user *) arg))
129 return -EFAULT;
130
131 handle = ext4_journal_start(inode, 1);
132 if (IS_ERR(handle))
133 return PTR_ERR(handle);
134 err = ext4_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC;
137 inode->i_generation = generation;
138 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
139 }
140 ext4_journal_stop(handle);
141 return err;
142 }
143#ifdef CONFIG_JBD_DEBUG
144 case EXT4_IOC_WAIT_FOR_READONLY:
145 /*
146 * This is racy - by the time we're woken up and running,
147 * the superblock could be released. And the module could
148 * have been unloaded. So sue me.
149 *
150 * Returns 1 if it slept, else zero.
151 */
152 {
153 struct super_block *sb = inode->i_sb;
154 DECLARE_WAITQUEUE(wait, current);
155 int ret = 0;
156
157 set_current_state(TASK_INTERRUPTIBLE);
158 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
159 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
160 schedule();
161 ret = 1;
162 }
163 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
164 return ret;
165 }
166#endif
167 case EXT4_IOC_GETRSVSZ:
168 if (test_opt(inode->i_sb, RESERVATION)
169 && S_ISREG(inode->i_mode)
170 && ei->i_block_alloc_info) {
171 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
172 return put_user(rsv_window_size, (int __user *)arg);
173 }
174 return -ENOTTY;
175 case EXT4_IOC_SETRSVSZ: {
176
177 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
178 return -ENOTTY;
179
180 if (IS_RDONLY(inode))
181 return -EROFS;
182
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
184 return -EACCES;
185
186 if (get_user(rsv_window_size, (int __user *)arg))
187 return -EFAULT;
188
189 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
190 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
191
192 /*
193 * need to allocate reservation structure for this inode
194		 * before setting the window size
195 */
196 mutex_lock(&ei->truncate_mutex);
197 if (!ei->i_block_alloc_info)
198 ext4_init_block_alloc_info(inode);
199
200 if (ei->i_block_alloc_info){
201 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
202 rsv->rsv_goal_size = rsv_window_size;
203 }
204 mutex_unlock(&ei->truncate_mutex);
205 return 0;
206 }
207 case EXT4_IOC_GROUP_EXTEND: {
208 ext4_fsblk_t n_blocks_count;
209 struct super_block *sb = inode->i_sb;
210 int err;
211
212 if (!capable(CAP_SYS_RESOURCE))
213 return -EPERM;
214
215 if (IS_RDONLY(inode))
216 return -EROFS;
217
218 if (get_user(n_blocks_count, (__u32 __user *)arg))
219 return -EFAULT;
220
221 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
222 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
223 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
224 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
225
226 return err;
227 }
228 case EXT4_IOC_GROUP_ADD: {
229 struct ext4_new_group_data input;
230 struct super_block *sb = inode->i_sb;
231 int err;
232
233 if (!capable(CAP_SYS_RESOURCE))
234 return -EPERM;
235
236 if (IS_RDONLY(inode))
237 return -EROFS;
238
239 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
240 sizeof(input)))
241 return -EFAULT;
242
243 err = ext4_group_add(sb, &input);
244 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
245 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
246 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
247
248 return err;
249 }
250
251 default:
252 return -ENOTTY;
253 }
254}
255
256#ifdef CONFIG_COMPAT
257long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258{
259 struct inode *inode = file->f_dentry->d_inode;
260 int ret;
261
262	/* These are just misnamed; they actually get/put an int from/to user space */
263 switch (cmd) {
264 case EXT4_IOC32_GETFLAGS:
265 cmd = EXT4_IOC_GETFLAGS;
266 break;
267 case EXT4_IOC32_SETFLAGS:
268 cmd = EXT4_IOC_SETFLAGS;
269 break;
270 case EXT4_IOC32_GETVERSION:
271 cmd = EXT4_IOC_GETVERSION;
272 break;
273 case EXT4_IOC32_SETVERSION:
274 cmd = EXT4_IOC_SETVERSION;
275 break;
276 case EXT4_IOC32_GROUP_EXTEND:
277 cmd = EXT4_IOC_GROUP_EXTEND;
278 break;
279 case EXT4_IOC32_GETVERSION_OLD:
280 cmd = EXT4_IOC_GETVERSION_OLD;
281 break;
282 case EXT4_IOC32_SETVERSION_OLD:
283 cmd = EXT4_IOC_SETVERSION_OLD;
284 break;
285#ifdef CONFIG_JBD_DEBUG
286 case EXT4_IOC32_WAIT_FOR_READONLY:
287 cmd = EXT4_IOC_WAIT_FOR_READONLY;
288 break;
289#endif
290 case EXT4_IOC32_GETRSVSZ:
291 cmd = EXT4_IOC_GETRSVSZ;
292 break;
293 case EXT4_IOC32_SETRSVSZ:
294 cmd = EXT4_IOC_SETRSVSZ;
295 break;
296 case EXT4_IOC_GROUP_ADD:
297 break;
298 default:
299 return -ENOIOCTLCMD;
300 }
301 lock_kernel();
302 ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
303 unlock_kernel();
304 return ret;
305}
306#endif
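/*
 * A user-space sketch of driving the flag ioctls handled above.  It
 * assumes EXT4_IOC_GETFLAGS/EXT4_IOC_SETFLAGS share their numbers with
 * the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS from <linux/fs.h>, as the
 * ext2-derived filesystems traditionally do, and it passes an int
 * because the handler above copies the flags with get_user()/put_user()
 * on an int.  Error handling is abbreviated.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, flags;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)	/* EXT4_IOC_GETFLAGS path above */
		return 1;
	printf("current flags: %#x\n", flags);
	flags |= FS_NOATIME_FL;				/* same bit as EXT4_NOATIME_FL */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)	/* EXT4_IOC_SETFLAGS path above */
		return 1;
	close(fd);
	return 0;
}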
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
1/*
2 * linux/fs/ext4/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h>
31#include <linux/ext4_fs.h>
32#include <linux/ext4_jbd2.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39#include <linux/smp_lock.h>
40
41#include "namei.h"
42#include "xattr.h"
43#include "acl.h"
44
45/*
46 * define how far ahead to read directories while searching them.
47 */
48#define NAMEI_RA_CHUNKS 2
49#define NAMEI_RA_BLOCKS 4
50#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
51#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
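/*
 * With the values above, the readahead window bh_use[] in
 * ext4_find_entry() holds NAMEI_RA_SIZE = 2 * 4 = 8 directory blocks.
 */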
52
53static struct buffer_head *ext4_append(handle_t *handle,
54 struct inode *inode,
55 u32 *block, int *err)
56{
57 struct buffer_head *bh;
58
59 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
60
61 if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
62 inode->i_size += inode->i_sb->s_blocksize;
63 EXT4_I(inode)->i_disksize = inode->i_size;
64 ext4_journal_get_write_access(handle,bh);
65 }
66 return bh;
67}
68
69#ifndef assert
70#define assert(test) J_ASSERT(test)
71#endif
72
73#ifndef swap
74#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
75#endif
76
77#ifdef DX_DEBUG
78#define dxtrace(command) command
79#else
80#define dxtrace(command)
81#endif
82
83struct fake_dirent
84{
85 __le32 inode;
86 __le16 rec_len;
87 u8 name_len;
88 u8 file_type;
89};
90
91struct dx_countlimit
92{
93 __le16 limit;
94 __le16 count;
95};
96
97struct dx_entry
98{
99 __le32 hash;
100 __le32 block;
101};
102
103/*
104 * dx_root_info is laid out so that if it should somehow get overlaid by a
105 * dirent the two low bits of the hash version will be zero. Therefore, the
106 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
107 */
108
109struct dx_root
110{
111 struct fake_dirent dot;
112 char dot_name[4];
113 struct fake_dirent dotdot;
114 char dotdot_name[4];
115 struct dx_root_info
116 {
117 __le32 reserved_zero;
118 u8 hash_version;
119 u8 info_length; /* 8 */
120 u8 indirect_levels;
121 u8 unused_flags;
122 }
123 info;
124 struct dx_entry entries[0];
125};
126
127struct dx_node
128{
129 struct fake_dirent fake;
130 struct dx_entry entries[0];
131};
132
133
134struct dx_frame
135{
136 struct buffer_head *bh;
137 struct dx_entry *entries;
138 struct dx_entry *at;
139};
140
141struct dx_map_entry
142{
143 u32 hash;
144 u32 offs;
145};
146
147#ifdef CONFIG_EXT4_INDEX
148static inline unsigned dx_get_block (struct dx_entry *entry);
149static void dx_set_block (struct dx_entry *entry, unsigned value);
150static inline unsigned dx_get_hash (struct dx_entry *entry);
151static void dx_set_hash (struct dx_entry *entry, unsigned value);
152static unsigned dx_get_count (struct dx_entry *entries);
153static unsigned dx_get_limit (struct dx_entry *entries);
154static void dx_set_count (struct dx_entry *entries, unsigned value);
155static void dx_set_limit (struct dx_entry *entries, unsigned value);
156static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
157static unsigned dx_node_limit (struct inode *dir);
158static struct dx_frame *dx_probe(struct dentry *dentry,
159 struct inode *dir,
160 struct dx_hash_info *hinfo,
161 struct dx_frame *frame,
162 int *err);
163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count);
169static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame,
173 struct dx_frame *frames,
174 __u32 *start_hash);
175static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
176 struct ext4_dir_entry_2 **res_dir, int *err);
177static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
178 struct inode *inode);
179
180/*
181 * Future: use high four bits of block for coalesce-on-delete flags
182 * Mask them off for now.
183 */
184
185static inline unsigned dx_get_block (struct dx_entry *entry)
186{
187 return le32_to_cpu(entry->block) & 0x00ffffff;
188}
189
190static inline void dx_set_block (struct dx_entry *entry, unsigned value)
191{
192 entry->block = cpu_to_le32(value);
193}
194
195static inline unsigned dx_get_hash (struct dx_entry *entry)
196{
197 return le32_to_cpu(entry->hash);
198}
199
200static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
201{
202 entry->hash = cpu_to_le32(value);
203}
204
205static inline unsigned dx_get_count (struct dx_entry *entries)
206{
207 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
208}
209
210static inline unsigned dx_get_limit (struct dx_entry *entries)
211{
212 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
213}
214
215static inline void dx_set_count (struct dx_entry *entries, unsigned value)
216{
217 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
218}
219
220static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
221{
222 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
223}
224
225static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
226{
227 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
228 EXT4_DIR_REC_LEN(2) - infosize;
229 return 0? 20: entry_space / sizeof(struct dx_entry);
230}
231
232static inline unsigned dx_node_limit (struct inode *dir)
233{
234 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
235 return 0? 22: entry_space / sizeof(struct dx_entry);
236}
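/*
 * A worked example of the two limits, assuming a 4 KB block and the
 * usual EXT4_DIR_REC_LEN rounding (name_len + 8, rounded up to a
 * multiple of 4): the dx_root block loses 12 + 12 bytes for the "." and
 * ".." fake dirents plus 8 bytes of dx_root_info, leaving
 * (4096 - 32) / 8 = 508 dx_entry slots; an interior dx_node loses only
 * its 8-byte fake dirent, leaving (4096 - 8) / 8 = 511 slots.
 */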
237
238/*
239 * Debug
240 */
241#ifdef DX_DEBUG
242static void dx_show_index (char * label, struct dx_entry *entries)
243{
244 int i, n = dx_get_count (entries);
245 printk("%s index ", label);
246 for (i = 0; i < n; i++) {
247 printk("%x->%u ", i? dx_get_hash(entries + i) :
248 0, dx_get_block(entries + i));
249 }
250 printk("\n");
251}
252
253struct stats
254{
255 unsigned names;
256 unsigned space;
257 unsigned bcount;
258};
259
260static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
261 int size, int show_names)
262{
263 unsigned names = 0, space = 0;
264 char *base = (char *) de;
265 struct dx_hash_info h = *hinfo;
266
267 printk("names: ");
268 while ((char *) de < base + size)
269 {
270 if (de->inode)
271 {
272 if (show_names)
273 {
274 int len = de->name_len;
275 char *name = de->name;
276 while (len--) printk("%c", *name++);
277 ext4fs_dirhash(de->name, de->name_len, &h);
278 printk(":%x.%u ", h.hash,
279 ((char *) de - base));
280 }
281 space += EXT4_DIR_REC_LEN(de->name_len);
282 names++;
283 }
284 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
285 }
286 printk("(%i)\n", names);
287 return (struct stats) { names, space, 1 };
288}
289
290struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
291 struct dx_entry *entries, int levels)
292{
293 unsigned blocksize = dir->i_sb->s_blocksize;
294 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
295 unsigned bcount = 0;
296 struct buffer_head *bh;
297 int err;
298 printk("%i indexed blocks...\n", count);
299 for (i = 0; i < count; i++, entries++)
300 {
301 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
302 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
303 struct stats stats;
304 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
305 if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
306 stats = levels?
307 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
308 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
309 names += stats.names;
310 space += stats.space;
311 bcount += stats.bcount;
312 brelse (bh);
313 }
314 if (bcount)
315 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
316 names, space/bcount,(space/bcount)*100/blocksize);
317 return (struct stats) { names, space, bcount};
318}
319#endif /* DX_DEBUG */
320
321/*
322 * Probe for a directory leaf block to search.
323 *
324 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
325 * error in the directory index, and the caller should fall back to
326 * searching the directory normally. The callers of dx_probe **MUST**
327 * check for this error code, and make sure it never gets reflected
328 * back to userspace.
329 */
330static struct dx_frame *
331dx_probe(struct dentry *dentry, struct inode *dir,
332 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
333{
334 unsigned count, indirect;
335 struct dx_entry *at, *entries, *p, *q, *m;
336 struct dx_root *root;
337 struct buffer_head *bh;
338 struct dx_frame *frame = frame_in;
339 u32 hash;
340
341 frame->bh = NULL;
342 if (dentry)
343 dir = dentry->d_parent->d_inode;
344 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
345 goto fail;
346 root = (struct dx_root *) bh->b_data;
347 if (root->info.hash_version != DX_HASH_TEA &&
348 root->info.hash_version != DX_HASH_HALF_MD4 &&
349 root->info.hash_version != DX_HASH_LEGACY) {
350 ext4_warning(dir->i_sb, __FUNCTION__,
351 "Unrecognised inode hash code %d",
352 root->info.hash_version);
353 brelse(bh);
354 *err = ERR_BAD_DX_DIR;
355 goto fail;
356 }
357 hinfo->hash_version = root->info.hash_version;
358 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
359 if (dentry)
360 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext4_warning(dir->i_sb, __FUNCTION__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext4_warning(dir->i_sb, __FUNCTION__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383 assert(dx_get_limit(entries) == dx_root_limit(dir,
384 root->info.info_length));
385 dxtrace (printk("Look up %x", hash));
386 while (1)
387 {
388 count = dx_get_count(entries);
389 assert (count && count <= dx_get_limit(entries));
390 p = entries + 1;
391 q = entries + count - 1;
392 while (p <= q)
393 {
394 m = p + (q - p)/2;
395 dxtrace(printk("."));
396 if (dx_get_hash(m) > hash)
397 q = m - 1;
398 else
399 p = m + 1;
400 }
401
402 if (0) // linear search cross check
403 {
404 unsigned n = count - 1;
405 at = entries;
406 while (n--)
407 {
408 dxtrace(printk(","));
409 if (dx_get_hash(++at) > hash)
410 {
411 at--;
412 break;
413 }
414 }
415 assert (at == p - 1);
416 }
417
418 at = p - 1;
419 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
420 frame->bh = bh;
421 frame->entries = entries;
422 frame->at = at;
423 if (!indirect--) return frame;
424 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
425 goto fail2;
426 at = entries = ((struct dx_node *) bh->b_data)->entries;
427 assert (dx_get_limit(entries) == dx_node_limit (dir));
428 frame++;
429 }
430fail2:
431 while (frame >= frame_in) {
432 brelse(frame->bh);
433 frame--;
434 }
435fail:
436 return NULL;
437}
438
439static void dx_release (struct dx_frame *frames)
440{
441 if (frames[0].bh == NULL)
442 return;
443
444 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
445 brelse(frames[1].bh);
446 brelse(frames[0].bh);
447}
448
449/*
450 * This function increments the frame pointer to search the next leaf
451 * block, and reads in the necessary intervening nodes if the search
452 * should be necessary. Whether or not the search is necessary is
453 * controlled by the hash parameter. If the hash value is even, then
454 * the search is only continued if the next block starts with that
455 * hash value. This is used if we are searching for a specific file.
456 *
457 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
458 *
459 * This function returns 1 if the caller should continue to search,
460 * or 0 if it should not. If there is an error reading one of the
461 * index blocks, it will return a negative error code.
462 *
463 * If start_hash is non-null, it will be filled in with the starting
464 * hash of the next page.
465 */
466static int ext4_htree_next_block(struct inode *dir, __u32 hash,
467 struct dx_frame *frame,
468 struct dx_frame *frames,
469 __u32 *start_hash)
470{
471 struct dx_frame *p;
472 struct buffer_head *bh;
473 int err, num_frames = 0;
474 __u32 bhash;
475
476 p = frame;
477 /*
478 * Find the next leaf page by incrementing the frame pointer.
479 * If we run out of entries in the interior node, loop around and
480 * increment pointer in the parent node. When we break out of
481 * this loop, num_frames indicates the number of interior
482 * nodes that need to be read.
483 */
484 while (1) {
485 if (++(p->at) < p->entries + dx_get_count(p->entries))
486 break;
487 if (p == frames)
488 return 0;
489 num_frames++;
490 p--;
491 }
492
493 /*
494 * If the hash is 1, then continue only if the next page has a
495 * continuation hash of any value. This is used for readdir
496 * handling. Otherwise, check to see if the hash matches the
497 * desired continuation hash.  If it doesn't, return, since
498 * there's no point in reading the successive index pages.
499 */
500 bhash = dx_get_hash(p->at);
501 if (start_hash)
502 *start_hash = bhash;
503 if ((hash & 1) == 0) {
504 if ((bhash & ~1) != hash)
505 return 0;
506 }
507 /*
508 * If the hash is HASH_NB_ALWAYS, we always go to the next
509 * block so no check is necessary
510 */
511 while (num_frames--) {
512 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
513 0, &err)))
514 return err; /* Failure */
515 p++;
516 brelse (p->bh);
517 p->bh = bh;
518 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
519 }
520 return 1;
521}
522
523
524/*
525 * p is at least 6 bytes before the end of page
526 */
527static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
528{
529 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
530}
531
532/*
533 * This function fills a red-black tree with information from a
534 * directory block.  It returns the number of directory entries loaded
535 * into the tree. If there is an error it is returned in err.
536 */
537static int htree_dirblock_to_tree(struct file *dir_file,
538 struct inode *dir, int block,
539 struct dx_hash_info *hinfo,
540 __u32 start_hash, __u32 start_minor_hash)
541{
542 struct buffer_head *bh;
543 struct ext4_dir_entry_2 *de, *top;
544 int err, count = 0;
545
546 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
547 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
548 return err;
549
550 de = (struct ext4_dir_entry_2 *) bh->b_data;
551 top = (struct ext4_dir_entry_2 *) ((char *) de +
552 dir->i_sb->s_blocksize -
553 EXT4_DIR_REC_LEN(0));
554 for (; de < top; de = ext4_next_entry(de)) {
555 ext4fs_dirhash(de->name, de->name_len, hinfo);
556 if ((hinfo->hash < start_hash) ||
557 ((hinfo->hash == start_hash) &&
558 (hinfo->minor_hash < start_minor_hash)))
559 continue;
560 if (de->inode == 0)
561 continue;
562 if ((err = ext4_htree_store_dirent(dir_file,
563 hinfo->hash, hinfo->minor_hash, de)) != 0) {
564 brelse(bh);
565 return err;
566 }
567 count++;
568 }
569 brelse(bh);
570 return count;
571}
572
573
574/*
575 * This function fills a red-black tree with information from a
576 * directory. We start scanning the directory in hash order, starting
577 * at start_hash and start_minor_hash.
578 *
579 * This function returns the number of entries inserted into the tree,
580 * or a negative error code.
581 */
582int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
583 __u32 start_minor_hash, __u32 *next_hash)
584{
585 struct dx_hash_info hinfo;
586 struct ext4_dir_entry_2 *de;
587 struct dx_frame frames[2], *frame;
588 struct inode *dir;
589 int block, err;
590 int count = 0;
591 int ret;
592 __u32 hashval;
593
594 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
595 start_minor_hash));
596 dir = dir_file->f_dentry->d_inode;
597 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
598 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
599 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
600 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
601 start_hash, start_minor_hash);
602 *next_hash = ~0;
603 return count;
604 }
605 hinfo.hash = start_hash;
606 hinfo.minor_hash = 0;
607 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
608 if (!frame)
609 return err;
610
611 /* Add '.' and '..' from the htree header */
612 if (!start_hash && !start_minor_hash) {
613 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
614 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
615 goto errout;
616 count++;
617 }
618 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
619 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
620 de = ext4_next_entry(de);
621 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
622 goto errout;
623 count++;
624 }
625
626 while (1) {
627 block = dx_get_block(frame->at);
628 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
629 start_hash, start_minor_hash);
630 if (ret < 0) {
631 err = ret;
632 goto errout;
633 }
634 count += ret;
635 hashval = ~0;
636 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
637 frame, frames, &hashval);
638 *next_hash = hashval;
639 if (ret < 0) {
640 err = ret;
641 goto errout;
642 }
643 /*
644 * Stop if: (a) there are no more entries, or
645 * (b) we have inserted at least one entry and the
646 * next hash value is not a continuation
647 */
648 if ((ret == 0) ||
649 (count && ((hashval & 1) == 0)))
650 break;
651 }
652 dx_release(frames);
653 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
654 count, *next_hash));
655 return count;
656errout:
657 dx_release(frames);
658 return (err);
659}
660
661
662/*
663 * Directory block splitting, compacting
664 */
665
666static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
667 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
668{
669 int count = 0;
670 char *base = (char *) de;
671 struct dx_hash_info h = *hinfo;
672
673 while ((char *) de < base + size)
674 {
675 if (de->name_len && de->inode) {
676 ext4fs_dirhash(de->name, de->name_len, &h);
677 map_tail--;
678 map_tail->hash = h.hash;
679 map_tail->offs = (u32) ((char *) de - base);
680 count++;
681 cond_resched();
682 }
683 /* XXX: do we need to check rec_len == 0 case? -Chris */
684 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
685 }
686 return count;
687}
688
689static void dx_sort_map (struct dx_map_entry *map, unsigned count)
690{
691 struct dx_map_entry *p, *q, *top = map + count - 1;
692 int more;
693 /* Combsort until bubble sort doesn't suck */
694 while (count > 2) {
695 count = count*10/13;
696 if (count - 9 < 2) /* 9, 10 -> 11 */
697 count = 11;
698 for (p = top, q = p - count; q >= map; p--, q--)
699 if (p->hash < q->hash)
700 swap(*p, *q);
701 }
702 /* Garden variety bubble sort */
703 do {
704 more = 0;
705 q = top;
706 while (q-- > map) {
707 if (q[1].hash >= q[0].hash)
708 continue;
709 swap(*(q+1), *q);
710 more = 1;
711 }
712 } while(more);
713}
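/*
 * Note on the comb sort above: because count is unsigned, the
 * "count - 9 < 2" test only fires for gaps of 9 or 10 (smaller gaps
 * wrap around and fail the comparison), which is the classic
 * comb-sort-11 tuning; the bubble-sort pass then finishes off the
 * nearly sorted map.
 */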
714
715static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
716{
717 struct dx_entry *entries = frame->entries;
718 struct dx_entry *old = frame->at, *new = old + 1;
719 int count = dx_get_count(entries);
720
721 assert(count < dx_get_limit(entries));
722 assert(old < entries + count);
723 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
724 dx_set_hash(new, hash);
725 dx_set_block(new, block);
726 dx_set_count(entries, count + 1);
727}
728#endif
729
730
731static void ext4_update_dx_flag(struct inode *inode)
732{
733 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
734 EXT4_FEATURE_COMPAT_DIR_INDEX))
735 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
736}
737
738/*
739 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
740 *
741 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
742 * `de != NULL' is guaranteed by caller.
743 */
744static inline int ext4_match (int len, const char * const name,
745 struct ext4_dir_entry_2 * de)
746{
747 if (len != de->name_len)
748 return 0;
749 if (!de->inode)
750 return 0;
751 return !memcmp(name, de->name, len);
752}
753
754/*
755 * Returns 0 if not found, -1 on failure, and 1 on success
756 */
757static inline int search_dirblock(struct buffer_head * bh,
758 struct inode *dir,
759 struct dentry *dentry,
760 unsigned long offset,
761 struct ext4_dir_entry_2 ** res_dir)
762{
763 struct ext4_dir_entry_2 * de;
764 char * dlimit;
765 int de_len;
766 const char *name = dentry->d_name.name;
767 int namelen = dentry->d_name.len;
768
769 de = (struct ext4_dir_entry_2 *) bh->b_data;
770 dlimit = bh->b_data + dir->i_sb->s_blocksize;
771 while ((char *) de < dlimit) {
772 /* this code is executed quadratically often */
773 /* do minimal checking `by hand' */
774
775 if ((char *) de + namelen <= dlimit &&
776 ext4_match (namelen, name, de)) {
777 /* found a match - just to be sure, do a full check */
778 if (!ext4_check_dir_entry("ext4_find_entry",
779 dir, de, bh, offset))
780 return -1;
781 *res_dir = de;
782 return 1;
783 }
784 /* prevent looping on a bad block */
785 de_len = le16_to_cpu(de->rec_len);
786 if (de_len <= 0)
787 return -1;
788 offset += de_len;
789 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
790 }
791 return 0;
792}
793
794
795/*
796 * ext4_find_entry()
797 *
798 * finds an entry in the specified directory with the wanted name. It
799 * returns the cache buffer in which the entry was found, and the entry
800 * itself (as a parameter - res_dir). It does NOT read the inode of the
801 * entry - you'll have to do that yourself if you want to.
802 *
803 * The returned buffer_head has ->b_count elevated. The caller is expected
804 * to brelse() it when appropriate.
805 */
806static struct buffer_head * ext4_find_entry (struct dentry *dentry,
807 struct ext4_dir_entry_2 ** res_dir)
808{
809 struct super_block * sb;
810 struct buffer_head * bh_use[NAMEI_RA_SIZE];
811 struct buffer_head * bh, *ret = NULL;
812 unsigned long start, block, b;
813 int ra_max = 0; /* Number of bh's in the readahead
814 buffer, bh_use[] */
815 int ra_ptr = 0; /* Current index into readahead
816 buffer */
817 int num = 0;
818 int nblocks, i, err;
819 struct inode *dir = dentry->d_parent->d_inode;
820 int namelen;
821 const u8 *name;
822 unsigned blocksize;
823
824 *res_dir = NULL;
825 sb = dir->i_sb;
826 blocksize = sb->s_blocksize;
827 namelen = dentry->d_name.len;
828 name = dentry->d_name.name;
829 if (namelen > EXT4_NAME_LEN)
830 return NULL;
831#ifdef CONFIG_EXT4_INDEX
832 if (is_dx(dir)) {
833 bh = ext4_dx_find_entry(dentry, res_dir, &err);
834 /*
835 * On success, or if the error was file not found,
836 * return. Otherwise, fall back to doing a search the
837 * old fashioned way.
838 */
839 if (bh || (err != ERR_BAD_DX_DIR))
840 return bh;
841 dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
842 }
843#endif
844 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
845 start = EXT4_I(dir)->i_dir_start_lookup;
846 if (start >= nblocks)
847 start = 0;
848 block = start;
849restart:
850 do {
851 /*
852 * We deal with the read-ahead logic here.
853 */
854 if (ra_ptr >= ra_max) {
855 /* Refill the readahead buffer */
856 ra_ptr = 0;
857 b = block;
858 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
859 /*
860 * Terminate if we reach the end of the
861 * directory and must wrap, or if our
862 * search has finished at this block.
863 */
864 if (b >= nblocks || (num && block == start)) {
865 bh_use[ra_max] = NULL;
866 break;
867 }
868 num++;
869 bh = ext4_getblk(NULL, dir, b++, 0, &err);
870 bh_use[ra_max] = bh;
871 if (bh)
872 ll_rw_block(READ_META, 1, &bh);
873 }
874 }
875 if ((bh = bh_use[ra_ptr++]) == NULL)
876 goto next;
877 wait_on_buffer(bh);
878 if (!buffer_uptodate(bh)) {
879 /* read error, skip block & hope for the best */
880 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
881 "offset %lu", dir->i_ino, block);
882 brelse(bh);
883 goto next;
884 }
885 i = search_dirblock(bh, dir, dentry,
886 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
887 if (i == 1) {
888 EXT4_I(dir)->i_dir_start_lookup = block;
889 ret = bh;
890 goto cleanup_and_exit;
891 } else {
892 brelse(bh);
893 if (i < 0)
894 goto cleanup_and_exit;
895 }
896 next:
897 if (++block >= nblocks)
898 block = 0;
899 } while (block != start);
900
901 /*
902 * If the directory has grown while we were searching, then
903 * search the last part of the directory before giving up.
904 */
905 block = nblocks;
906 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
907 if (block < nblocks) {
908 start = 0;
909 goto restart;
910 }
911
912cleanup_and_exit:
913 /* Clean up the read-ahead blocks */
914 for (; ra_ptr < ra_max; ra_ptr++)
915 brelse (bh_use[ra_ptr]);
916 return ret;
917}
918
919#ifdef CONFIG_EXT4_INDEX
920static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
921 struct ext4_dir_entry_2 **res_dir, int *err)
922{
923 struct super_block * sb;
924 struct dx_hash_info hinfo;
925 u32 hash;
926 struct dx_frame frames[2], *frame;
927 struct ext4_dir_entry_2 *de, *top;
928 struct buffer_head *bh;
929 unsigned long block;
930 int retval;
931 int namelen = dentry->d_name.len;
932 const u8 *name = dentry->d_name.name;
933 struct inode *dir = dentry->d_parent->d_inode;
934
935 sb = dir->i_sb;
936 /* NFS may look up ".." - look at dx_root directory block */
937 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
938 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
939 return NULL;
940 } else {
941 frame = frames;
942 frame->bh = NULL; /* for dx_release() */
943 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
944 dx_set_block(frame->at, 0); /* dx_root block is 0 */
945 }
946 hash = hinfo.hash;
947 do {
948 block = dx_get_block(frame->at);
949 if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
950 goto errout;
951 de = (struct ext4_dir_entry_2 *) bh->b_data;
952 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
953 EXT4_DIR_REC_LEN(0));
954 for (; de < top; de = ext4_next_entry(de))
955 if (ext4_match (namelen, name, de)) {
956 if (!ext4_check_dir_entry("ext4_find_entry",
957 dir, de, bh,
958 (block<<EXT4_BLOCK_SIZE_BITS(sb))
959 +((char *)de - bh->b_data))) {
960 brelse (bh);
961 goto errout;
962 }
963 *res_dir = de;
964 dx_release (frames);
965 return bh;
966 }
967 brelse (bh);
968 /* Check to see if we should continue to search */
969 retval = ext4_htree_next_block(dir, hash, frame,
970 frames, NULL);
971 if (retval < 0) {
972 ext4_warning(sb, __FUNCTION__,
973 "error reading index page in directory #%lu",
974 dir->i_ino);
975 *err = retval;
976 goto errout;
977 }
978 } while (retval == 1);
979
980 *err = -ENOENT;
981errout:
982 dxtrace(printk("%s not found\n", name));
983 dx_release (frames);
984 return NULL;
985}
986#endif
987
988static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
989{
990 struct inode * inode;
991 struct ext4_dir_entry_2 * de;
992 struct buffer_head * bh;
993
994 if (dentry->d_name.len > EXT4_NAME_LEN)
995 return ERR_PTR(-ENAMETOOLONG);
996
997 bh = ext4_find_entry(dentry, &de);
998 inode = NULL;
999 if (bh) {
1000 unsigned long ino = le32_to_cpu(de->inode);
1001 brelse (bh);
1002 if (!ext4_valid_inum(dir->i_sb, ino)) {
1003 ext4_error(dir->i_sb, "ext4_lookup",
1004 "bad inode number: %lu", ino);
1005 inode = NULL;
1006 } else
1007 inode = iget(dir->i_sb, ino);
1008
1009 if (!inode)
1010 return ERR_PTR(-EACCES);
1011 }
1012 return d_splice_alias(inode, dentry);
1013}
1014
1015
1016struct dentry *ext4_get_parent(struct dentry *child)
1017{
1018 unsigned long ino;
1019 struct dentry *parent;
1020 struct inode *inode;
1021 struct dentry dotdot;
1022 struct ext4_dir_entry_2 * de;
1023 struct buffer_head *bh;
1024
1025 dotdot.d_name.name = "..";
1026 dotdot.d_name.len = 2;
1027 dotdot.d_parent = child; /* confusing, isn't it! */
1028
1029 bh = ext4_find_entry(&dotdot, &de);
1030 inode = NULL;
1031 if (!bh)
1032 return ERR_PTR(-ENOENT);
1033 ino = le32_to_cpu(de->inode);
1034 brelse(bh);
1035
1036 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1037 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1038 "bad inode number: %lu", ino);
1039 inode = NULL;
1040 } else
1041 inode = iget(child->d_inode->i_sb, ino);
1042
1043 if (!inode)
1044 return ERR_PTR(-EACCES);
1045
1046 parent = d_alloc_anon(inode);
1047 if (!parent) {
1048 iput(inode);
1049 parent = ERR_PTR(-ENOMEM);
1050 }
1051 return parent;
1052}
1053
1054#define S_SHIFT 12
1055static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1056 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1057 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1058 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1059 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1060 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1061 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1062 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1063};
1064
1065static inline void ext4_set_de_type(struct super_block *sb,
1066 struct ext4_dir_entry_2 *de,
1067 umode_t mode) {
1068 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1069 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1070}
1071
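The mapping above is a plain array lookup keyed on the top four mode bits. A standalone userspace sketch (not part of the patch; the FT_* values mirror the standard ext2/3/4 on-disk file types) of the same arithmetic:

#include <stdio.h>
#include <sys/stat.h>

/* Userspace sketch of the lookup above.  The FT_* values mirror the
 * on-disk ext2/3/4 directory-entry file types (REG=1, DIR=2, ...). */
enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV,
       FT_FIFO, FT_SOCK, FT_SYMLINK };

#define SHIFT 12
static const unsigned char type_by_mode[S_IFMT >> SHIFT] = {
	[S_IFREG >> SHIFT]  = FT_REG_FILE,
	[S_IFDIR >> SHIFT]  = FT_DIR,
	[S_IFCHR >> SHIFT]  = FT_CHRDEV,
	[S_IFBLK >> SHIFT]  = FT_BLKDEV,
	[S_IFIFO >> SHIFT]  = FT_FIFO,
	[S_IFSOCK >> SHIFT] = FT_SOCK,
	[S_IFLNK >> SHIFT]  = FT_SYMLINK,
};

int main(void)
{
	/* S_IFDIR is 0040000, so (mode & S_IFMT) >> 12 == 4, which maps to FT_DIR. */
	printf("dir  -> %u\n", type_by_mode[(S_IFDIR & S_IFMT) >> SHIFT]);
	printf("fifo -> %u\n", type_by_mode[(S_IFIFO & S_IFMT) >> SHIFT]);
	return 0;
}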
1072#ifdef CONFIG_EXT4_INDEX
1073static struct ext4_dir_entry_2 *
1074dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1075{
1076 unsigned rec_len = 0;
1077
1078 while (count--) {
1079 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
1080 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1081 memcpy (to, de, rec_len);
1082 ((struct ext4_dir_entry_2 *) to)->rec_len =
1083 cpu_to_le16(rec_len);
1084 de->inode = 0;
1085 map++;
1086 to += rec_len;
1087 }
1088 return (struct ext4_dir_entry_2 *) (to - rec_len);
1089}
1090
1091static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1092{
1093 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1094 unsigned rec_len = 0;
1095
1096 prev = to = de;
1097 while ((char*)de < base + size) {
1098 next = (struct ext4_dir_entry_2 *) ((char *) de +
1099 le16_to_cpu(de->rec_len));
1100 if (de->inode && de->name_len) {
1101 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1102 if (de > to)
1103 memmove(to, de, rec_len);
1104 to->rec_len = cpu_to_le16(rec_len);
1105 prev = to;
1106 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1107 }
1108 de = next;
1109 }
1110 return prev;
1111}
1112
1113static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1114 struct buffer_head **bh,struct dx_frame *frame,
1115 struct dx_hash_info *hinfo, int *error)
1116{
1117 unsigned blocksize = dir->i_sb->s_blocksize;
1118 unsigned count, continued;
1119 struct buffer_head *bh2;
1120 u32 newblock;
1121 u32 hash2;
1122 struct dx_map_entry *map;
1123 char *data1 = (*bh)->b_data, *data2;
1124 unsigned split;
1125 struct ext4_dir_entry_2 *de = NULL, *de2;
1126 int err;
1127
1128 bh2 = ext4_append (handle, dir, &newblock, error);
1129 if (!(bh2)) {
1130 brelse(*bh);
1131 *bh = NULL;
1132 goto errout;
1133 }
1134
1135 BUFFER_TRACE(*bh, "get_write_access");
1136 err = ext4_journal_get_write_access(handle, *bh);
1137 if (err) {
1138 journal_error:
1139 brelse(*bh);
1140 brelse(bh2);
1141 *bh = NULL;
1142 ext4_std_error(dir->i_sb, err);
1143 goto errout;
1144 }
1145 BUFFER_TRACE(frame->bh, "get_write_access");
1146 err = ext4_journal_get_write_access(handle, frame->bh);
1147 if (err)
1148 goto journal_error;
1149
1150 data2 = bh2->b_data;
1151
 1152	/* create map at the end of the data2 block */
1153 map = (struct dx_map_entry *) (data2 + blocksize);
1154 count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
1155 blocksize, hinfo, map);
1156 map -= count;
1157 split = count/2; // need to adjust to actual middle
1158 dx_sort_map (map, count);
1159 hash2 = map[split].hash;
1160 continued = hash2 == map[split - 1].hash;
1161 dxtrace(printk("Split block %i at %x, %i/%i\n",
1162 dx_get_block(frame->at), hash2, split, count-split));
1163
1164 /* Fancy dance to stay within two buffers */
1165 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1166 de = dx_pack_dirents(data1,blocksize);
1167 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1168 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1169 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1170 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1171
1172 /* Which block gets the new entry? */
1173 if (hinfo->hash >= hash2)
1174 {
1175 swap(*bh, bh2);
1176 de = de2;
1177 }
1178 dx_insert_block (frame, hash2 + continued, newblock);
1179 err = ext4_journal_dirty_metadata (handle, bh2);
1180 if (err)
1181 goto journal_error;
1182 err = ext4_journal_dirty_metadata (handle, frame->bh);
1183 if (err)
1184 goto journal_error;
1185 brelse (bh2);
1186 dxtrace(dx_show_index ("frame", frame->entries));
1187errout:
1188 return de;
1189}
1190#endif
1191
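do_split() amounts to ordering the leaf's entries by hash, cutting the set in half, and publishing the first hash of the new block in the index so later lookups are routed to the right leaf. A toy sketch of that routing decision, with made-up hash values:

#include <stdio.h>
#include <stdlib.h>

/* Toy model of the leaf split: order the entries' hashes, cut at the middle,
 * and publish the first hash of the new block so the index can route lookups. */
static int cmp_hash(const void *a, const void *b)
{
	unsigned x = *(const unsigned *)a, y = *(const unsigned *)b;
	return (x > y) - (x < y);
}

int main(void)
{
	unsigned hashes[] = { 0x9a10, 0x1204, 0x77f0, 0x2458, 0x5c3e, 0xe001 };
	unsigned count = sizeof(hashes) / sizeof(hashes[0]);
	unsigned split = count / 2;

	qsort(hashes, count, sizeof(hashes[0]), cmp_hash);
	printf("old block keeps %u entries, new block gets %u\n", split, count - split);
	printf("hashes >= %#x are routed to the new block\n", hashes[split]);
	return 0;
}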
1192
1193/*
 1194 * Add a new entry into a directory (leaf) block.  If de is non-NULL,
 1195 * it points to a directory entry which is guaranteed to be large
 1196 * enough for the new directory entry.  If de is NULL, then
 1197 * add_dirent_to_buf will attempt to search the directory block for
 1198 * space.  It will return -ENOSPC if no space is available, -EIO on an
 1199 * I/O error, and -EEXIST if the directory entry already exists.
1200 *
1201 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1202 * all other cases bh is released.
1203 */
1204static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1205 struct inode *inode, struct ext4_dir_entry_2 *de,
1206 struct buffer_head * bh)
1207{
1208 struct inode *dir = dentry->d_parent->d_inode;
1209 const char *name = dentry->d_name.name;
1210 int namelen = dentry->d_name.len;
1211 unsigned long offset = 0;
1212 unsigned short reclen;
1213 int nlen, rlen, err;
1214 char *top;
1215
1216 reclen = EXT4_DIR_REC_LEN(namelen);
1217 if (!de) {
1218 de = (struct ext4_dir_entry_2 *)bh->b_data;
1219 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1220 while ((char *) de <= top) {
1221 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1222 bh, offset)) {
1223 brelse (bh);
1224 return -EIO;
1225 }
1226 if (ext4_match (namelen, name, de)) {
1227 brelse (bh);
1228 return -EEXIST;
1229 }
1230 nlen = EXT4_DIR_REC_LEN(de->name_len);
1231 rlen = le16_to_cpu(de->rec_len);
1232 if ((de->inode? rlen - nlen: rlen) >= reclen)
1233 break;
1234 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1235 offset += rlen;
1236 }
1237 if ((char *) de > top)
1238 return -ENOSPC;
1239 }
1240 BUFFER_TRACE(bh, "get_write_access");
1241 err = ext4_journal_get_write_access(handle, bh);
1242 if (err) {
1243 ext4_std_error(dir->i_sb, err);
1244 brelse(bh);
1245 return err;
1246 }
1247
1248 /* By now the buffer is marked for journaling */
1249 nlen = EXT4_DIR_REC_LEN(de->name_len);
1250 rlen = le16_to_cpu(de->rec_len);
1251 if (de->inode) {
1252 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1253 de1->rec_len = cpu_to_le16(rlen - nlen);
1254 de->rec_len = cpu_to_le16(nlen);
1255 de = de1;
1256 }
1257 de->file_type = EXT4_FT_UNKNOWN;
1258 if (inode) {
1259 de->inode = cpu_to_le32(inode->i_ino);
1260 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1261 } else
1262 de->inode = 0;
1263 de->name_len = namelen;
1264 memcpy (de->name, name, namelen);
1265 /*
1266 * XXX shouldn't update any times until successful
1267 * completion of syscall, but too many callers depend
1268 * on this.
1269 *
1270 * XXX similarly, too many callers depend on
1271 * ext4_new_inode() setting the times, but error
1272 * recovery deletes the inode, so the worst that can
1273 * happen is that the times are slightly out of date
1274 * and/or different from the directory change time.
1275 */
1276 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1277 ext4_update_dx_flag(dir);
1278 dir->i_version++;
1279 ext4_mark_inode_dirty(handle, dir);
1280 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1281 err = ext4_journal_dirty_metadata(handle, bh);
1282 if (err)
1283 ext4_std_error(dir->i_sb, err);
1284 brelse(bh);
1285 return 0;
1286}
1287
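The search loop above turns on the difference between an entry's rec_len (the bytes it owns on disk) and the bytes its name actually needs; if the slack is large enough, the live entry is shrunk and the new entry takes over the tail. A standalone sketch of that arithmetic, assuming the usual 8-byte header and 4-byte rounding of EXT4_DIR_REC_LEN:

#include <stdio.h>

/* Sketch (not kernel code) of the rec_len arithmetic used above.  An on-disk
 * entry owns rec_len bytes but only needs REC_LEN(name_len) of them; any
 * slack at its tail can host the new entry. */
#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)  /* 8-byte header, 4-byte rounding */

int main(void)
{
	unsigned rlen = 40;        /* rec_len of the entry that owns the free space */
	unsigned old_name = 5;     /* its name_len                                  */
	unsigned new_name = 11;    /* name_len of the entry being inserted          */

	unsigned nlen = REC_LEN(old_name);    /* bytes the live entry really needs: 16 */
	unsigned reclen = REC_LEN(new_name);  /* bytes the new entry needs: 20         */

	if (rlen - nlen >= reclen)
		/* the split performed once get_write_access succeeds */
		printf("old entry shrinks to rec_len %u, new entry takes rec_len %u\n",
		       nlen, rlen - nlen);
	return 0;
}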
1288#ifdef CONFIG_EXT4_INDEX
1289/*
1290 * This converts a one block unindexed directory to a 3 block indexed
1291 * directory, and adds the dentry to the indexed directory.
1292 */
1293static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1294 struct inode *inode, struct buffer_head *bh)
1295{
1296 struct inode *dir = dentry->d_parent->d_inode;
1297 const char *name = dentry->d_name.name;
1298 int namelen = dentry->d_name.len;
1299 struct buffer_head *bh2;
1300 struct dx_root *root;
1301 struct dx_frame frames[2], *frame;
1302 struct dx_entry *entries;
1303 struct ext4_dir_entry_2 *de, *de2;
1304 char *data1, *top;
1305 unsigned len;
1306 int retval;
1307 unsigned blocksize;
1308 struct dx_hash_info hinfo;
1309 u32 block;
1310 struct fake_dirent *fde;
1311
1312 blocksize = dir->i_sb->s_blocksize;
1313 dxtrace(printk("Creating index\n"));
1314 retval = ext4_journal_get_write_access(handle, bh);
1315 if (retval) {
1316 ext4_std_error(dir->i_sb, retval);
1317 brelse(bh);
1318 return retval;
1319 }
1320 root = (struct dx_root *) bh->b_data;
1321
1322 bh2 = ext4_append (handle, dir, &block, &retval);
1323 if (!(bh2)) {
1324 brelse(bh);
1325 return retval;
1326 }
1327 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1328 data1 = bh2->b_data;
1329
1330 /* The 0th block becomes the root, move the dirents out */
1331 fde = &root->dotdot;
1332 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1333 len = ((char *) root) + blocksize - (char *) de;
1334 memcpy (data1, de, len);
1335 de = (struct ext4_dir_entry_2 *) data1;
1336 top = data1 + len;
1337 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
1338 de = de2;
1339 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1340 /* Initialize the root; the dot dirents already exist */
1341 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1342 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
1343 memset (&root->info, 0, sizeof(root->info));
1344 root->info.info_length = sizeof(root->info);
1345 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1346 entries = root->entries;
1347 dx_set_block (entries, 1);
1348 dx_set_count (entries, 1);
1349 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1350
1351 /* Initialize as for dx_probe */
1352 hinfo.hash_version = root->info.hash_version;
1353 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1354 ext4fs_dirhash(name, namelen, &hinfo);
1355 frame = frames;
1356 frame->entries = entries;
1357 frame->at = entries;
1358 frame->bh = bh;
1359 bh = bh2;
1360 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1361 dx_release (frames);
1362 if (!(de))
1363 return retval;
1364
1365 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1366}
1367#endif
1368
1369/*
1370 * ext4_add_entry()
1371 *
 1372 * adds a file entry to the specified directory, using the same
 1373 * semantics as ext4_find_entry(); it returns 0 or a negative errno.
1374 *
1375 * NOTE!! The inode part of 'de' is left at 0 - which means you
1376 * may not sleep between calling this and putting something into
1377 * the entry, as someone else might have used it while you slept.
1378 */
1379static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1380 struct inode *inode)
1381{
1382 struct inode *dir = dentry->d_parent->d_inode;
1383 unsigned long offset;
1384 struct buffer_head * bh;
1385 struct ext4_dir_entry_2 *de;
1386 struct super_block * sb;
1387 int retval;
1388#ifdef CONFIG_EXT4_INDEX
1389 int dx_fallback=0;
1390#endif
1391 unsigned blocksize;
1392 u32 block, blocks;
1393
1394 sb = dir->i_sb;
1395 blocksize = sb->s_blocksize;
1396 if (!dentry->d_name.len)
1397 return -EINVAL;
1398#ifdef CONFIG_EXT4_INDEX
1399 if (is_dx(dir)) {
1400 retval = ext4_dx_add_entry(handle, dentry, inode);
1401 if (!retval || (retval != ERR_BAD_DX_DIR))
1402 return retval;
1403 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
1404 dx_fallback++;
1405 ext4_mark_inode_dirty(handle, dir);
1406 }
1407#endif
1408 blocks = dir->i_size >> sb->s_blocksize_bits;
1409 for (block = 0, offset = 0; block < blocks; block++) {
1410 bh = ext4_bread(handle, dir, block, 0, &retval);
1411 if(!bh)
1412 return retval;
1413 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1414 if (retval != -ENOSPC)
1415 return retval;
1416
1417#ifdef CONFIG_EXT4_INDEX
1418 if (blocks == 1 && !dx_fallback &&
1419 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1420 return make_indexed_dir(handle, dentry, inode, bh);
1421#endif
1422 brelse(bh);
1423 }
1424 bh = ext4_append(handle, dir, &block, &retval);
1425 if (!bh)
1426 return retval;
1427 de = (struct ext4_dir_entry_2 *) bh->b_data;
1428 de->inode = 0;
1429 de->rec_len = cpu_to_le16(blocksize);
1430 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1431}
1432
1433#ifdef CONFIG_EXT4_INDEX
1434/*
1435 * Returns 0 for success, or a negative error value
1436 */
1437static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1438 struct inode *inode)
1439{
1440 struct dx_frame frames[2], *frame;
1441 struct dx_entry *entries, *at;
1442 struct dx_hash_info hinfo;
1443 struct buffer_head * bh;
1444 struct inode *dir = dentry->d_parent->d_inode;
1445 struct super_block * sb = dir->i_sb;
1446 struct ext4_dir_entry_2 *de;
1447 int err;
1448
1449 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1450 if (!frame)
1451 return err;
1452 entries = frame->entries;
1453 at = frame->at;
1454
1455 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1456 goto cleanup;
1457
1458 BUFFER_TRACE(bh, "get_write_access");
1459 err = ext4_journal_get_write_access(handle, bh);
1460 if (err)
1461 goto journal_error;
1462
1463 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1464 if (err != -ENOSPC) {
1465 bh = NULL;
1466 goto cleanup;
1467 }
1468
1469 /* Block full, should compress but for now just split */
1470 dxtrace(printk("using %u of %u node entries\n",
1471 dx_get_count(entries), dx_get_limit(entries)));
1472 /* Need to split index? */
1473 if (dx_get_count(entries) == dx_get_limit(entries)) {
1474 u32 newblock;
1475 unsigned icount = dx_get_count(entries);
1476 int levels = frame - frames;
1477 struct dx_entry *entries2;
1478 struct dx_node *node2;
1479 struct buffer_head *bh2;
1480
1481 if (levels && (dx_get_count(frames->entries) ==
1482 dx_get_limit(frames->entries))) {
1483 ext4_warning(sb, __FUNCTION__,
1484 "Directory index full!");
1485 err = -ENOSPC;
1486 goto cleanup;
1487 }
1488 bh2 = ext4_append (handle, dir, &newblock, &err);
1489 if (!(bh2))
1490 goto cleanup;
1491 node2 = (struct dx_node *)(bh2->b_data);
1492 entries2 = node2->entries;
1493 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1494 node2->fake.inode = 0;
1495 BUFFER_TRACE(frame->bh, "get_write_access");
1496 err = ext4_journal_get_write_access(handle, frame->bh);
1497 if (err)
1498 goto journal_error;
1499 if (levels) {
1500 unsigned icount1 = icount/2, icount2 = icount - icount1;
1501 unsigned hash2 = dx_get_hash(entries + icount1);
1502 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1503
1504 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1505 err = ext4_journal_get_write_access(handle,
1506 frames[0].bh);
1507 if (err)
1508 goto journal_error;
1509
1510 memcpy ((char *) entries2, (char *) (entries + icount1),
1511 icount2 * sizeof(struct dx_entry));
1512 dx_set_count (entries, icount1);
1513 dx_set_count (entries2, icount2);
1514 dx_set_limit (entries2, dx_node_limit(dir));
1515
1516 /* Which index block gets the new entry? */
1517 if (at - entries >= icount1) {
1518 frame->at = at = at - entries - icount1 + entries2;
1519 frame->entries = entries = entries2;
1520 swap(frame->bh, bh2);
1521 }
1522 dx_insert_block (frames + 0, hash2, newblock);
1523 dxtrace(dx_show_index ("node", frames[1].entries));
1524 dxtrace(dx_show_index ("node",
1525 ((struct dx_node *) bh2->b_data)->entries));
1526 err = ext4_journal_dirty_metadata(handle, bh2);
1527 if (err)
1528 goto journal_error;
1529 brelse (bh2);
1530 } else {
1531 dxtrace(printk("Creating second level index...\n"));
1532 memcpy((char *) entries2, (char *) entries,
1533 icount * sizeof(struct dx_entry));
1534 dx_set_limit(entries2, dx_node_limit(dir));
1535
1536 /* Set up root */
1537 dx_set_count(entries, 1);
1538 dx_set_block(entries + 0, newblock);
1539 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1540
1541 /* Add new access path frame */
1542 frame = frames + 1;
1543 frame->at = at = at - entries + entries2;
1544 frame->entries = entries = entries2;
1545 frame->bh = bh2;
1546 err = ext4_journal_get_write_access(handle,
1547 frame->bh);
1548 if (err)
1549 goto journal_error;
1550 }
1551 ext4_journal_dirty_metadata(handle, frames[0].bh);
1552 }
1553 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1554 if (!de)
1555 goto cleanup;
1556 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1557 bh = NULL;
1558 goto cleanup;
1559
1560journal_error:
1561 ext4_std_error(dir->i_sb, err);
1562cleanup:
1563 if (bh)
1564 brelse(bh);
1565 dx_release(frames);
1566 return err;
1567}
1568#endif
1569
1570/*
1571 * ext4_delete_entry deletes a directory entry by merging it with the
1572 * previous entry
1573 */
1574static int ext4_delete_entry (handle_t *handle,
1575 struct inode * dir,
1576 struct ext4_dir_entry_2 * de_del,
1577 struct buffer_head * bh)
1578{
1579 struct ext4_dir_entry_2 * de, * pde;
1580 int i;
1581
1582 i = 0;
1583 pde = NULL;
1584 de = (struct ext4_dir_entry_2 *) bh->b_data;
1585 while (i < bh->b_size) {
1586 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
1587 return -EIO;
1588 if (de == de_del) {
1589 BUFFER_TRACE(bh, "get_write_access");
1590 ext4_journal_get_write_access(handle, bh);
1591 if (pde)
1592 pde->rec_len =
1593 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1594 le16_to_cpu(de->rec_len));
1595 else
1596 de->inode = 0;
1597 dir->i_version++;
1598 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1599 ext4_journal_dirty_metadata(handle, bh);
1600 return 0;
1601 }
1602 i += le16_to_cpu(de->rec_len);
1603 pde = de;
1604 de = (struct ext4_dir_entry_2 *)
1605 ((char *) de + le16_to_cpu(de->rec_len));
1606 }
1607 return -ENOENT;
1608}
1609
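Deletion by merging never repacks the block: the predecessor's rec_len simply grows to cover the removed record, which then becomes invisible to any rec_len walk. A toy userspace illustration with made-up entries (not the on-disk struct):

#include <stdio.h>

/* Toy illustration (made-up entries, not the on-disk struct) of deletion by
 * merging: removing the middle record just widens its predecessor. */
struct toy_entry { unsigned inode; unsigned rec_len; };

int main(void)
{
	struct toy_entry blk[3] = { {11, 16}, {12, 20}, {13, 28} };  /* one 64-byte block */
	unsigned victim = 12;
	int i, prev = -1;

	for (i = 0; i < 3; i++) {
		if (blk[i].inode == victim) {
			if (prev >= 0)
				blk[prev].rec_len += blk[i].rec_len;  /* swallow the record */
			else
				blk[i].inode = 0;                     /* first entry: clear */
			break;
		}
		prev = i;
	}

	/* A later rec_len walk now steps from entry 11 straight past the hole. */
	printf("first record now spans %u bytes\n", blk[0].rec_len);
	return 0;
}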
1610/*
1611 * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1612 * do not perform it in these functions. We perform it at the call site,
1613 * if it is needed.
1614 */
1615static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
1616{
1617 inc_nlink(inode);
1618}
1619
1620static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
1621{
1622 drop_nlink(inode);
1623}
1624
1625static int ext4_add_nondir(handle_t *handle,
1626 struct dentry *dentry, struct inode *inode)
1627{
1628 int err = ext4_add_entry(handle, dentry, inode);
1629 if (!err) {
1630 ext4_mark_inode_dirty(handle, inode);
1631 d_instantiate(dentry, inode);
1632 return 0;
1633 }
1634 ext4_dec_count(handle, inode);
1635 iput(inode);
1636 return err;
1637}
1638
1639/*
1640 * By the time this is called, we already have created
1641 * the directory cache entry for the new file, but it
1642 * is so far negative - it has no inode.
1643 *
1644 * If the create succeeds, we fill in the inode information
1645 * with d_instantiate().
1646 */
1647static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
1648 struct nameidata *nd)
1649{
1650 handle_t *handle;
1651 struct inode * inode;
1652 int err, retries = 0;
1653
1654retry:
1655 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1656 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1657 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1658 if (IS_ERR(handle))
1659 return PTR_ERR(handle);
1660
1661 if (IS_DIRSYNC(dir))
1662 handle->h_sync = 1;
1663
1664 inode = ext4_new_inode (handle, dir, mode);
1665 err = PTR_ERR(inode);
1666 if (!IS_ERR(inode)) {
1667 inode->i_op = &ext4_file_inode_operations;
1668 inode->i_fop = &ext4_file_operations;
1669 ext4_set_aops(inode);
1670 err = ext4_add_nondir(handle, dentry, inode);
1671 }
1672 ext4_journal_stop(handle);
1673 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1674 goto retry;
1675 return err;
1676}
1677
1678static int ext4_mknod (struct inode * dir, struct dentry *dentry,
1679 int mode, dev_t rdev)
1680{
1681 handle_t *handle;
1682 struct inode *inode;
1683 int err, retries = 0;
1684
1685 if (!new_valid_dev(rdev))
1686 return -EINVAL;
1687
1688retry:
1689 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1690 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1691 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1692 if (IS_ERR(handle))
1693 return PTR_ERR(handle);
1694
1695 if (IS_DIRSYNC(dir))
1696 handle->h_sync = 1;
1697
1698 inode = ext4_new_inode (handle, dir, mode);
1699 err = PTR_ERR(inode);
1700 if (!IS_ERR(inode)) {
1701 init_special_inode(inode, inode->i_mode, rdev);
1702#ifdef CONFIG_EXT4DEV_FS_XATTR
1703 inode->i_op = &ext4_special_inode_operations;
1704#endif
1705 err = ext4_add_nondir(handle, dentry, inode);
1706 }
1707 ext4_journal_stop(handle);
1708 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1709 goto retry;
1710 return err;
1711}
1712
1713static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1714{
1715 handle_t *handle;
1716 struct inode * inode;
1717 struct buffer_head * dir_block;
1718 struct ext4_dir_entry_2 * de;
1719 int err, retries = 0;
1720
1721 if (dir->i_nlink >= EXT4_LINK_MAX)
1722 return -EMLINK;
1723
1724retry:
1725 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1726 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1727 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1728 if (IS_ERR(handle))
1729 return PTR_ERR(handle);
1730
1731 if (IS_DIRSYNC(dir))
1732 handle->h_sync = 1;
1733
1734 inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
1735 err = PTR_ERR(inode);
1736 if (IS_ERR(inode))
1737 goto out_stop;
1738
1739 inode->i_op = &ext4_dir_inode_operations;
1740 inode->i_fop = &ext4_dir_operations;
1741 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1742 dir_block = ext4_bread (handle, inode, 0, 1, &err);
1743 if (!dir_block) {
1744 drop_nlink(inode); /* is this nlink == 0? */
1745 ext4_mark_inode_dirty(handle, inode);
1746 iput (inode);
1747 goto out_stop;
1748 }
1749 BUFFER_TRACE(dir_block, "get_write_access");
1750 ext4_journal_get_write_access(handle, dir_block);
1751 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1752 de->inode = cpu_to_le32(inode->i_ino);
1753 de->name_len = 1;
1754 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
1755 strcpy (de->name, ".");
1756 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1757 de = (struct ext4_dir_entry_2 *)
1758 ((char *) de + le16_to_cpu(de->rec_len));
1759 de->inode = cpu_to_le32(dir->i_ino);
1760 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
1761 de->name_len = 2;
1762 strcpy (de->name, "..");
1763 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1764 inode->i_nlink = 2;
1765 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1766 ext4_journal_dirty_metadata(handle, dir_block);
1767 brelse (dir_block);
1768 ext4_mark_inode_dirty(handle, inode);
1769 err = ext4_add_entry (handle, dentry, inode);
1770 if (err) {
1771 inode->i_nlink = 0;
1772 ext4_mark_inode_dirty(handle, inode);
1773 iput (inode);
1774 goto out_stop;
1775 }
1776 inc_nlink(dir);
1777 ext4_update_dx_flag(dir);
1778 ext4_mark_inode_dirty(handle, dir);
1779 d_instantiate(dentry, inode);
1780out_stop:
1781 ext4_journal_stop(handle);
1782 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1783 goto retry;
1784 return err;
1785}
1786
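A freshly created directory therefore starts with exactly two records in block 0: "." sized to its minimum record length and ".." owning the rest of the block. A small sketch of those rec_len values, assuming a 4096-byte block:

#include <stdio.h>

/* Sketch of the two bootstrap records ext4_mkdir writes into block 0,
 * assuming a 4096-byte block and the usual 8-byte-header/4-byte rounding. */
#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

int main(void)
{
	unsigned blocksize = 4096;

	printf(".  rec_len = %u\n", REC_LEN(1));              /* 12 bytes          */
	printf(".. rec_len = %u\n", blocksize - REC_LEN(1));  /* rest of the block */
	return 0;
}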
1787/*
1788 * routine to check that the specified directory is empty (for rmdir)
1789 */
1790static int empty_dir (struct inode * inode)
1791{
1792 unsigned long offset;
1793 struct buffer_head * bh;
1794 struct ext4_dir_entry_2 * de, * de1;
1795 struct super_block * sb;
1796 int err = 0;
1797
1798 sb = inode->i_sb;
1799 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1800 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
1801 if (err)
1802 ext4_error(inode->i_sb, __FUNCTION__,
1803 "error %d reading directory #%lu offset 0",
1804 err, inode->i_ino);
1805 else
1806 ext4_warning(inode->i_sb, __FUNCTION__,
1807 "bad directory (dir #%lu) - no data block",
1808 inode->i_ino);
1809 return 1;
1810 }
1811 de = (struct ext4_dir_entry_2 *) bh->b_data;
1812 de1 = (struct ext4_dir_entry_2 *)
1813 ((char *) de + le16_to_cpu(de->rec_len));
1814 if (le32_to_cpu(de->inode) != inode->i_ino ||
1815 !le32_to_cpu(de1->inode) ||
1816 strcmp (".", de->name) ||
1817 strcmp ("..", de1->name)) {
1818 ext4_warning (inode->i_sb, "empty_dir",
1819 "bad directory (dir #%lu) - no `.' or `..'",
1820 inode->i_ino);
1821 brelse (bh);
1822 return 1;
1823 }
1824 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1825 de = (struct ext4_dir_entry_2 *)
1826 ((char *) de1 + le16_to_cpu(de1->rec_len));
1827 while (offset < inode->i_size ) {
1828 if (!bh ||
1829 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1830 err = 0;
1831 brelse (bh);
1832 bh = ext4_bread (NULL, inode,
1833 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1834 if (!bh) {
1835 if (err)
1836 ext4_error(sb, __FUNCTION__,
1837 "error %d reading directory"
1838 " #%lu offset %lu",
1839 err, inode->i_ino, offset);
1840 offset += sb->s_blocksize;
1841 continue;
1842 }
1843 de = (struct ext4_dir_entry_2 *) bh->b_data;
1844 }
1845 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1846 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1847 sb->s_blocksize);
1848 offset = (offset | (sb->s_blocksize - 1)) + 1;
1849 continue;
1850 }
1851 if (le32_to_cpu(de->inode)) {
1852 brelse (bh);
1853 return 0;
1854 }
1855 offset += le16_to_cpu(de->rec_len);
1856 de = (struct ext4_dir_entry_2 *)
1857 ((char *) de + le16_to_cpu(de->rec_len));
1858 }
1859 brelse (bh);
1860 return 1;
1861}
1862
1863/* ext4_orphan_add() links an unlinked or truncated inode into a list of
1864 * such inodes, starting at the superblock, in case we crash before the
1865 * file is closed/deleted, or in case the inode truncate spans multiple
1866 * transactions and the last transaction is not recovered after a crash.
1867 *
1868 * At filesystem recovery time, we walk this list deleting unlinked
1869 * inodes and truncating linked inodes in ext4_orphan_cleanup().
1870 */
1871int ext4_orphan_add(handle_t *handle, struct inode *inode)
1872{
1873 struct super_block *sb = inode->i_sb;
1874 struct ext4_iloc iloc;
1875 int err = 0, rc;
1876
1877 lock_super(sb);
1878 if (!list_empty(&EXT4_I(inode)->i_orphan))
1879 goto out_unlock;
1880
1881 /* Orphan handling is only valid for files with data blocks
1882 * being truncated, or files being unlinked. */
1883
 1884	/* @@@ FIXME: Observation from aviro:
 1885	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
 1886	 * here (on lock_super()), so we can race with ext4_link(), which might
 1887	 * bump ->i_nlink for, say, a character device: not a regular file,
 1888	 * not a directory, not a symlink, and ->i_nlink > 0.
1889 */
1890 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1891 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1892
1893 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1894 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
1895 if (err)
1896 goto out_unlock;
1897
1898 err = ext4_reserve_inode_write(handle, inode, &iloc);
1899 if (err)
1900 goto out_unlock;
1901
1902 /* Insert this inode at the head of the on-disk orphan list... */
1903 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1904 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1905 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1906 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1907 if (!err)
1908 err = rc;
1909
1910 /* Only add to the head of the in-memory list if all the
1911 * previous operations succeeded. If the orphan_add is going to
1912 * fail (possibly taking the journal offline), we can't risk
1913 * leaving the inode on the orphan list: stray orphan-list
1914 * entries can cause panics at unmount time.
1915 *
1916 * This is safe: on error we're going to ignore the orphan list
1917 * anyway on the next recovery. */
1918 if (!err)
1919 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1920
1921 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1922 jbd_debug(4, "orphan inode %lu will point to %d\n",
1923 inode->i_ino, NEXT_ORPHAN(inode));
1924out_unlock:
1925 unlock_super(sb);
1926 ext4_std_error(inode->i_sb, err);
1927 return err;
1928}
1929
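The on-disk orphan chain built here is a singly linked list: the superblock records the most recently orphaned inode, and each orphan records the previous head (kept in an otherwise-unused field of the on-disk inode). A userspace sketch of the head insertion and of the walk that recovery performs:

#include <stdio.h>

/* Userspace sketch of the on-disk orphan chain: the superblock holds the most
 * recently orphaned inode number and each orphan points at the previous head. */
#define NINODES 16

int main(void)
{
	unsigned next_orphan[NINODES] = {0};   /* per-inode "next" link, 0 ends the list */
	unsigned last_orphan = 0;              /* superblock's s_last_orphan             */
	unsigned victims[] = {5, 3, 7};        /* inodes being unlinked or truncated     */
	unsigned i, ino;

	for (i = 0; i < 3; i++) {              /* head insertion, as in ext4_orphan_add  */
		next_orphan[victims[i]] = last_orphan;
		last_orphan = victims[i];
	}

	printf("recovery walks:");
	for (ino = last_orphan; ino; ino = next_orphan[ino])
		printf(" %u", ino);            /* prints 7 3 5 */
	printf("\n");
	return 0;
}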
1930/*
1931 * ext4_orphan_del() removes an unlinked or truncated inode from the list
1932 * of such inodes stored on disk, because it is finally being cleaned up.
1933 */
1934int ext4_orphan_del(handle_t *handle, struct inode *inode)
1935{
1936 struct list_head *prev;
1937 struct ext4_inode_info *ei = EXT4_I(inode);
1938 struct ext4_sb_info *sbi;
1939 unsigned long ino_next;
1940 struct ext4_iloc iloc;
1941 int err = 0;
1942
1943 lock_super(inode->i_sb);
1944 if (list_empty(&ei->i_orphan)) {
1945 unlock_super(inode->i_sb);
1946 return 0;
1947 }
1948
1949 ino_next = NEXT_ORPHAN(inode);
1950 prev = ei->i_orphan.prev;
1951 sbi = EXT4_SB(inode->i_sb);
1952
1953 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1954
1955 list_del_init(&ei->i_orphan);
1956
1957 /* If we're on an error path, we may not have a valid
1958 * transaction handle with which to update the orphan list on
1959 * disk, but we still need to remove the inode from the linked
1960 * list in memory. */
1961 if (!handle)
1962 goto out;
1963
1964 err = ext4_reserve_inode_write(handle, inode, &iloc);
1965 if (err)
1966 goto out_err;
1967
1968 if (prev == &sbi->s_orphan) {
1969 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1970 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1971 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1972 if (err)
1973 goto out_brelse;
1974 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1975 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
1976 } else {
1977 struct ext4_iloc iloc2;
1978 struct inode *i_prev =
1979 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
1980
1981 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1982 i_prev->i_ino, ino_next);
1983 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
1984 if (err)
1985 goto out_brelse;
1986 NEXT_ORPHAN(i_prev) = ino_next;
1987 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
1988 }
1989 if (err)
1990 goto out_brelse;
1991 NEXT_ORPHAN(inode) = 0;
1992 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
1993
1994out_err:
1995 ext4_std_error(inode->i_sb, err);
1996out:
1997 unlock_super(inode->i_sb);
1998 return err;
1999
2000out_brelse:
2001 brelse(iloc.bh);
2002 goto out_err;
2003}
2004
2005static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2006{
2007 int retval;
2008 struct inode * inode;
2009 struct buffer_head * bh;
2010 struct ext4_dir_entry_2 * de;
2011 handle_t *handle;
2012
 2013	/* Initialize quotas beforehand so that any resulting writes go
 2014	 * into a separate transaction */
2015 DQUOT_INIT(dentry->d_inode);
2016 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2017 if (IS_ERR(handle))
2018 return PTR_ERR(handle);
2019
2020 retval = -ENOENT;
2021 bh = ext4_find_entry (dentry, &de);
2022 if (!bh)
2023 goto end_rmdir;
2024
2025 if (IS_DIRSYNC(dir))
2026 handle->h_sync = 1;
2027
2028 inode = dentry->d_inode;
2029
2030 retval = -EIO;
2031 if (le32_to_cpu(de->inode) != inode->i_ino)
2032 goto end_rmdir;
2033
2034 retval = -ENOTEMPTY;
2035 if (!empty_dir (inode))
2036 goto end_rmdir;
2037
2038 retval = ext4_delete_entry(handle, dir, de, bh);
2039 if (retval)
2040 goto end_rmdir;
2041 if (inode->i_nlink != 2)
2042 ext4_warning (inode->i_sb, "ext4_rmdir",
2043 "empty directory has nlink!=2 (%d)",
2044 inode->i_nlink);
2045 inode->i_version++;
2046 clear_nlink(inode);
2047 /* There's no need to set i_disksize: the fact that i_nlink is
2048 * zero will ensure that the right thing happens during any
2049 * recovery. */
2050 inode->i_size = 0;
2051 ext4_orphan_add(handle, inode);
2052 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2053 ext4_mark_inode_dirty(handle, inode);
2054 drop_nlink(dir);
2055 ext4_update_dx_flag(dir);
2056 ext4_mark_inode_dirty(handle, dir);
2057
2058end_rmdir:
2059 ext4_journal_stop(handle);
2060 brelse (bh);
2061 return retval;
2062}
2063
2064static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2065{
2066 int retval;
2067 struct inode * inode;
2068 struct buffer_head * bh;
2069 struct ext4_dir_entry_2 * de;
2070 handle_t *handle;
2071
 2072	/* Initialize quotas beforehand so that any resulting writes go
 2073	 * into a separate transaction */
2074 DQUOT_INIT(dentry->d_inode);
2075 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2076 if (IS_ERR(handle))
2077 return PTR_ERR(handle);
2078
2079 if (IS_DIRSYNC(dir))
2080 handle->h_sync = 1;
2081
2082 retval = -ENOENT;
2083 bh = ext4_find_entry (dentry, &de);
2084 if (!bh)
2085 goto end_unlink;
2086
2087 inode = dentry->d_inode;
2088
2089 retval = -EIO;
2090 if (le32_to_cpu(de->inode) != inode->i_ino)
2091 goto end_unlink;
2092
2093 if (!inode->i_nlink) {
2094 ext4_warning (inode->i_sb, "ext4_unlink",
2095 "Deleting nonexistent file (%lu), %d",
2096 inode->i_ino, inode->i_nlink);
2097 inode->i_nlink = 1;
2098 }
2099 retval = ext4_delete_entry(handle, dir, de, bh);
2100 if (retval)
2101 goto end_unlink;
2102 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2103 ext4_update_dx_flag(dir);
2104 ext4_mark_inode_dirty(handle, dir);
2105 drop_nlink(inode);
2106 if (!inode->i_nlink)
2107 ext4_orphan_add(handle, inode);
2108 inode->i_ctime = dir->i_ctime;
2109 ext4_mark_inode_dirty(handle, inode);
2110 retval = 0;
2111
2112end_unlink:
2113 ext4_journal_stop(handle);
2114 brelse (bh);
2115 return retval;
2116}
2117
2118static int ext4_symlink (struct inode * dir,
2119 struct dentry *dentry, const char * symname)
2120{
2121 handle_t *handle;
2122 struct inode * inode;
2123 int l, err, retries = 0;
2124
2125 l = strlen(symname)+1;
2126 if (l > dir->i_sb->s_blocksize)
2127 return -ENAMETOOLONG;
2128
2129retry:
2130 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2131 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2132 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
2133 if (IS_ERR(handle))
2134 return PTR_ERR(handle);
2135
2136 if (IS_DIRSYNC(dir))
2137 handle->h_sync = 1;
2138
2139 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2140 err = PTR_ERR(inode);
2141 if (IS_ERR(inode))
2142 goto out_stop;
2143
2144 if (l > sizeof (EXT4_I(inode)->i_data)) {
2145 inode->i_op = &ext4_symlink_inode_operations;
2146 ext4_set_aops(inode);
2147 /*
2148 * page_symlink() calls into ext4_prepare/commit_write.
2149 * We have a transaction open. All is sweetness. It also sets
2150 * i_size in generic_commit_write().
2151 */
2152 err = __page_symlink(inode, symname, l,
2153 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2154 if (err) {
2155 ext4_dec_count(handle, inode);
2156 ext4_mark_inode_dirty(handle, inode);
2157 iput (inode);
2158 goto out_stop;
2159 }
2160 } else {
2161 inode->i_op = &ext4_fast_symlink_inode_operations;
2162 memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
2163 inode->i_size = l-1;
2164 }
2165 EXT4_I(inode)->i_disksize = inode->i_size;
2166 err = ext4_add_nondir(handle, dentry, inode);
2167out_stop:
2168 ext4_journal_stop(handle);
2169 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2170 goto retry;
2171 return err;
2172}
2173
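The branch above distinguishes "slow" symlinks, whose target goes into a data block via page_symlink, from "fast" symlinks short enough to live inside the inode's block-pointer area (15 * 4 = 60 bytes, including the trailing NUL). A minimal sketch of that decision with an illustrative target path:

#include <stdio.h>
#include <string.h>

/* Sketch of the fast/slow symlink decision: targets that fit in the inode's
 * 60-byte i_data area are stored inline; longer ones get a data block. */
int main(void)
{
	const char *target = "../lib/modules/current";   /* illustrative path */
	size_t l = strlen(target) + 1;

	if (l > 60)
		printf("slow symlink: %zu bytes go into a data block\n", l);
	else
		printf("fast symlink: %zu bytes stored in the inode itself\n", l);
	return 0;
}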
2174static int ext4_link (struct dentry * old_dentry,
2175 struct inode * dir, struct dentry *dentry)
2176{
2177 handle_t *handle;
2178 struct inode *inode = old_dentry->d_inode;
2179 int err, retries = 0;
2180
2181 if (inode->i_nlink >= EXT4_LINK_MAX)
2182 return -EMLINK;
2183
2184retry:
2185 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2186 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
2187 if (IS_ERR(handle))
2188 return PTR_ERR(handle);
2189
2190 if (IS_DIRSYNC(dir))
2191 handle->h_sync = 1;
2192
2193 inode->i_ctime = CURRENT_TIME_SEC;
2194 ext4_inc_count(handle, inode);
2195 atomic_inc(&inode->i_count);
2196
2197 err = ext4_add_nondir(handle, dentry, inode);
2198 ext4_journal_stop(handle);
2199 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2200 goto retry;
2201 return err;
2202}
2203
2204#define PARENT_INO(buffer) \
2205 ((struct ext4_dir_entry_2 *) ((char *) buffer + \
2206 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
2207
2208/*
2209 * Anybody can rename anything with this: the permission checks are left to the
2210 * higher-level routines.
2211 */
2212static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2213 struct inode * new_dir,struct dentry *new_dentry)
2214{
2215 handle_t *handle;
2216 struct inode * old_inode, * new_inode;
2217 struct buffer_head * old_bh, * new_bh, * dir_bh;
2218 struct ext4_dir_entry_2 * old_de, * new_de;
2219 int retval;
2220
2221 old_bh = new_bh = dir_bh = NULL;
2222
 2223	/* Initialize quotas beforehand so that any resulting writes go
 2224	 * into a separate transaction */
2225 if (new_dentry->d_inode)
2226 DQUOT_INIT(new_dentry->d_inode);
2227 handle = ext4_journal_start(old_dir, 2 *
2228 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2229 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
2230 if (IS_ERR(handle))
2231 return PTR_ERR(handle);
2232
2233 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2234 handle->h_sync = 1;
2235
2236 old_bh = ext4_find_entry (old_dentry, &old_de);
2237 /*
 2238	 * The check of the inode number is _not_ there because of possible
 2239	 * IO errors.  We might rmdir the source, keep it as the pwd of some
 2240	 * process, and merrily kill the link to whatever was created under
 2241	 * the same name.  Goodbye sticky bit ;-<
2242 */
2243 old_inode = old_dentry->d_inode;
2244 retval = -ENOENT;
2245 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2246 goto end_rename;
2247
2248 new_inode = new_dentry->d_inode;
2249 new_bh = ext4_find_entry (new_dentry, &new_de);
2250 if (new_bh) {
2251 if (!new_inode) {
2252 brelse (new_bh);
2253 new_bh = NULL;
2254 }
2255 }
2256 if (S_ISDIR(old_inode->i_mode)) {
2257 if (new_inode) {
2258 retval = -ENOTEMPTY;
2259 if (!empty_dir (new_inode))
2260 goto end_rename;
2261 }
2262 retval = -EIO;
2263 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
2264 if (!dir_bh)
2265 goto end_rename;
2266 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2267 goto end_rename;
2268 retval = -EMLINK;
2269 if (!new_inode && new_dir!=old_dir &&
2270 new_dir->i_nlink >= EXT4_LINK_MAX)
2271 goto end_rename;
2272 }
2273 if (!new_bh) {
2274 retval = ext4_add_entry (handle, new_dentry, old_inode);
2275 if (retval)
2276 goto end_rename;
2277 } else {
2278 BUFFER_TRACE(new_bh, "get write access");
2279 ext4_journal_get_write_access(handle, new_bh);
2280 new_de->inode = cpu_to_le32(old_inode->i_ino);
2281 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2282 EXT4_FEATURE_INCOMPAT_FILETYPE))
2283 new_de->file_type = old_de->file_type;
2284 new_dir->i_version++;
2285 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
2286 ext4_journal_dirty_metadata(handle, new_bh);
2287 brelse(new_bh);
2288 new_bh = NULL;
2289 }
2290
2291 /*
2292 * Like most other Unix systems, set the ctime for inodes on a
2293 * rename.
2294 */
2295 old_inode->i_ctime = CURRENT_TIME_SEC;
2296 ext4_mark_inode_dirty(handle, old_inode);
2297
2298 /*
2299 * ok, that's it
2300 */
2301 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2302 old_de->name_len != old_dentry->d_name.len ||
2303 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2304 (retval = ext4_delete_entry(handle, old_dir,
2305 old_de, old_bh)) == -ENOENT) {
2306 /* old_de could have moved from under us during htree split, so
2307 * make sure that we are deleting the right entry. We might
2308 * also be pointing to a stale entry in the unused part of
2309 * old_bh so just checking inum and the name isn't enough. */
2310 struct buffer_head *old_bh2;
2311 struct ext4_dir_entry_2 *old_de2;
2312
2313 old_bh2 = ext4_find_entry(old_dentry, &old_de2);
2314 if (old_bh2) {
2315 retval = ext4_delete_entry(handle, old_dir,
2316 old_de2, old_bh2);
2317 brelse(old_bh2);
2318 }
2319 }
2320 if (retval) {
2321 ext4_warning(old_dir->i_sb, "ext4_rename",
2322 "Deleting old file (%lu), %d, error=%d",
2323 old_dir->i_ino, old_dir->i_nlink, retval);
2324 }
2325
2326 if (new_inode) {
2327 drop_nlink(new_inode);
2328 new_inode->i_ctime = CURRENT_TIME_SEC;
2329 }
2330 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2331 ext4_update_dx_flag(old_dir);
2332 if (dir_bh) {
2333 BUFFER_TRACE(dir_bh, "get_write_access");
2334 ext4_journal_get_write_access(handle, dir_bh);
2335 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2336 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
2337 ext4_journal_dirty_metadata(handle, dir_bh);
2338 drop_nlink(old_dir);
2339 if (new_inode) {
2340 drop_nlink(new_inode);
2341 } else {
2342 inc_nlink(new_dir);
2343 ext4_update_dx_flag(new_dir);
2344 ext4_mark_inode_dirty(handle, new_dir);
2345 }
2346 }
2347 ext4_mark_inode_dirty(handle, old_dir);
2348 if (new_inode) {
2349 ext4_mark_inode_dirty(handle, new_inode);
2350 if (!new_inode->i_nlink)
2351 ext4_orphan_add(handle, new_inode);
2352 }
2353 retval = 0;
2354
2355end_rename:
2356 brelse (dir_bh);
2357 brelse (old_bh);
2358 brelse (new_bh);
2359 ext4_journal_stop(handle);
2360 return retval;
2361}
2362
2363/*
2364 * directories can handle most operations...
2365 */
2366struct inode_operations ext4_dir_inode_operations = {
2367 .create = ext4_create,
2368 .lookup = ext4_lookup,
2369 .link = ext4_link,
2370 .unlink = ext4_unlink,
2371 .symlink = ext4_symlink,
2372 .mkdir = ext4_mkdir,
2373 .rmdir = ext4_rmdir,
2374 .mknod = ext4_mknod,
2375 .rename = ext4_rename,
2376 .setattr = ext4_setattr,
2377#ifdef CONFIG_EXT4DEV_FS_XATTR
2378 .setxattr = generic_setxattr,
2379 .getxattr = generic_getxattr,
2380 .listxattr = ext4_listxattr,
2381 .removexattr = generic_removexattr,
2382#endif
2383 .permission = ext4_permission,
2384};
2385
2386struct inode_operations ext4_special_inode_operations = {
2387 .setattr = ext4_setattr,
2388#ifdef CONFIG_EXT4DEV_FS_XATTR
2389 .setxattr = generic_setxattr,
2390 .getxattr = generic_getxattr,
2391 .listxattr = ext4_listxattr,
2392 .removexattr = generic_removexattr,
2393#endif
2394 .permission = ext4_permission,
2395};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..1e9578052cd3
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1045 @@
1/*
2 * linux/fs/ext4/resize.c
3 *
4 * Support for resizing an ext4 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often in use.
9 */
10
11
12#define EXT4FS_DEBUG
13
14#include <linux/sched.h>
15#include <linux/smp_lock.h>
16#include <linux/ext4_jbd2.h>
17
18#include <linux/errno.h>
19#include <linux/slab.h>
20
21
22#define outside(b, first, last) ((b) < (first) || (b) >= (last))
23#define inside(b, first, last) ((b) >= (first) && (b) < (last))
24
25static int verify_group_input(struct super_block *sb,
26 struct ext4_new_group_data *input)
27{
28 struct ext4_sb_info *sbi = EXT4_SB(sb);
29 struct ext4_super_block *es = sbi->s_es;
30 ext4_fsblk_t start = ext4_blocks_count(es);
31 ext4_fsblk_t end = start + input->blocks_count;
32 unsigned group = input->group;
33 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
34 unsigned overhead = ext4_bg_has_super(sb, group) ?
35 (1 + ext4_bg_num_gdb(sb, group) +
36 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
37 ext4_fsblk_t metaend = start + overhead;
38 struct buffer_head *bh = NULL;
39 ext4_grpblk_t free_blocks_count, offset;
40 int err = -EINVAL;
41
42 input->free_blocks_count = free_blocks_count =
43 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
44
45 if (test_opt(sb, DEBUG))
46 printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
47 "(%d free, %u reserved)\n",
48 ext4_bg_has_super(sb, input->group) ? "normal" :
49 "no-super", input->group, input->blocks_count,
50 free_blocks_count, input->reserved_blocks);
51
52 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
53 if (group != sbi->s_groups_count)
54 ext4_warning(sb, __FUNCTION__,
55 "Cannot add at group %u (only %lu groups)",
56 input->group, sbi->s_groups_count);
57 else if (offset != 0)
58 ext4_warning(sb, __FUNCTION__, "Last group not full");
59 else if (input->reserved_blocks > input->blocks_count / 5)
60 ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
61 input->reserved_blocks);
62 else if (free_blocks_count < 0)
63 ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
64 input->blocks_count);
65 else if (!(bh = sb_bread(sb, end - 1)))
66 ext4_warning(sb, __FUNCTION__,
67 "Cannot read last block (%llu)",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext4_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %llu)",
72 input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext4_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %llu)",
76 input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext4_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %llu-%llu)",
81 input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext4_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%llu)",
85 input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext4_warning(sb, __FUNCTION__,
88 "Block bitmap (%llu) in inode table (%llu-%llu)",
89 input->block_bitmap, input->inode_table, itend-1);
90 else if (inside(input->inode_bitmap, input->inode_table, itend))
91 ext4_warning(sb, __FUNCTION__,
92 "Inode bitmap (%llu) in inode table (%llu-%llu)",
93 input->inode_bitmap, input->inode_table, itend-1);
94 else if (inside(input->block_bitmap, start, metaend))
95 ext4_warning(sb, __FUNCTION__,
96 "Block bitmap (%llu) in GDT table"
97 " (%llu-%llu)",
98 input->block_bitmap, start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __FUNCTION__,
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 input->inode_bitmap, start, metaend - 1);
104 else if (inside(input->inode_table, start, metaend) ||
105 inside(itend - 1, start, metaend))
106 ext4_warning(sb, __FUNCTION__,
107 "Inode table (%llu-%llu) overlaps"
108 "GDT table (%llu-%llu)",
109 input->inode_table, itend - 1, start, metaend - 1);
110 else
111 err = 0;
112 brelse(bh);
113
114 return err;
115}
116
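The free_blocks_count computed at the top of verify_group_input() is simply everything in the new group that is not metadata. A worked example with made-up but plausible numbers:

#include <stdio.h>

/* Worked example (made-up but plausible numbers) of the accounting at the
 * top of verify_group_input(): free blocks = everything that is not metadata. */
int main(void)
{
	unsigned blocks_count  = 32768;  /* blocks in the new group                 */
	unsigned gdb_blocks    = 1;      /* group descriptor blocks backed up here  */
	unsigned reserved_gdt  = 128;    /* s_reserved_gdt_blocks                   */
	unsigned itb_per_group = 512;    /* inode table blocks                      */
	int has_super          = 1;      /* this group carries a sb/GDT backup      */

	unsigned overhead = has_super ? 1 + gdb_blocks + reserved_gdt : 0;
	unsigned free_blocks = blocks_count - 2 /* the two bitmaps */
			       - overhead - itb_per_group;

	printf("overhead = %u, free = %u\n", overhead, free_blocks);  /* 130, 32124 */
	return 0;
}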
117static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 ext4_fsblk_t blk)
119{
120 struct buffer_head *bh;
121 int err;
122
123 bh = sb_getblk(sb, blk);
124 if (!bh)
125 return ERR_PTR(-EIO);
126 if ((err = ext4_journal_get_write_access(handle, bh))) {
127 brelse(bh);
128 bh = ERR_PTR(err);
129 } else {
130 lock_buffer(bh);
131 memset(bh->b_data, 0, sb->s_blocksize);
132 set_buffer_uptodate(bh);
133 unlock_buffer(bh);
134 }
135
136 return bh;
137}
138
139/*
140 * To avoid calling the atomic setbit hundreds or thousands of times, we only
141 * need to use it within a single byte (to ensure we get endianness right).
142 * We can use memset for the rest of the bitmap as there are no other users.
143 */
144static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
145{
146 int i;
147
148 if (start_bit >= end_bit)
149 return;
150
151 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
152 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
153 ext4_set_bit(i, bitmap);
154 if (i < end_bit)
155 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
156}
157
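The byte-boundary trick described above can be exercised directly from userspace: set bits one at a time only up to the next multiple of 8, then memset whole 0xff bytes for the remainder. A small sketch with illustrative bit numbers:

#include <stdio.h>
#include <string.h>

/* Sketch of mark_bitmap_end(): single-bit sets up to the next byte boundary,
 * then whole 0xff bytes for the rest of the range. */
int main(void)
{
	unsigned char bitmap[128] = {0};
	unsigned start_bit = 1003, end_bit = 1024;   /* e.g. blocks_count..blocks per group */
	unsigned i;

	for (i = start_bit; i < ((start_bit + 7) & ~7U); i++)
		bitmap[i >> 3] |= 1 << (i & 7);      /* five single-bit sets: 1003..1007 */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);   /* two whole bytes */

	printf("bytes 125..127: %02x %02x %02x\n", bitmap[125], bitmap[126], bitmap[127]);
	return 0;
}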
158/*
159 * Set up the block and inode bitmaps, and the inode table for the new group.
160 * This doesn't need to be part of the main transaction, since we are only
161 * changing blocks outside the actual filesystem. We still do journaling to
162 * ensure the recovery is correct in case of a failure just after resize.
163 * If any part of this fails, we simply abort the resize.
164 */
165static int setup_new_group_blocks(struct super_block *sb,
166 struct ext4_new_group_data *input)
167{
168 struct ext4_sb_info *sbi = EXT4_SB(sb);
169 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
170 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
171 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
172 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
173 struct buffer_head *bh;
174 handle_t *handle;
175 ext4_fsblk_t block;
176 ext4_grpblk_t bit;
177 int i;
178 int err = 0, err2;
179
180 handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
181 2 + sbi->s_itb_per_group);
182 if (IS_ERR(handle))
183 return PTR_ERR(handle);
184
185 lock_super(sb);
186 if (input->group != sbi->s_groups_count) {
187 err = -EBUSY;
188 goto exit_journal;
189 }
190
191 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
192 err = PTR_ERR(bh);
193 goto exit_journal;
194 }
195
196 if (ext4_bg_has_super(sb, input->group)) {
197 ext4_debug("mark backup superblock %#04lx (+0)\n", start);
198 ext4_set_bit(0, bh->b_data);
199 }
200
201 /* Copy all of the GDT blocks into the backup in this group */
202 for (i = 0, bit = 1, block = start + 1;
203 i < gdblocks; i++, block++, bit++) {
204 struct buffer_head *gdb;
205
206 ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
207
208 gdb = sb_getblk(sb, block);
209 if (!gdb) {
210 err = -EIO;
211 goto exit_bh;
212 }
213 if ((err = ext4_journal_get_write_access(handle, gdb))) {
214 brelse(gdb);
215 goto exit_bh;
216 }
217 lock_buffer(bh);
218 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
219 set_buffer_uptodate(gdb);
220 unlock_buffer(bh);
221 ext4_journal_dirty_metadata(handle, gdb);
222 ext4_set_bit(bit, bh->b_data);
223 brelse(gdb);
224 }
225
226 /* Zero out all of the reserved backup group descriptor table blocks */
227 for (i = 0, bit = gdblocks + 1, block = start + bit;
228 i < reserved_gdb; i++, block++, bit++) {
229 struct buffer_head *gdb;
230
231 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
232
233 if (IS_ERR(gdb = bclean(handle, sb, block))) {
 234			err = PTR_ERR(gdb);
235 goto exit_bh;
236 }
237 ext4_journal_dirty_metadata(handle, gdb);
238 ext4_set_bit(bit, bh->b_data);
239 brelse(gdb);
240 }
241 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
242 input->block_bitmap - start);
243 ext4_set_bit(input->block_bitmap - start, bh->b_data);
244 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
245 input->inode_bitmap - start);
246 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
247
248 /* Zero out all of the inode table blocks */
249 for (i = 0, block = input->inode_table, bit = block - start;
250 i < sbi->s_itb_per_group; i++, bit++, block++) {
251 struct buffer_head *it;
252
253 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
254 if (IS_ERR(it = bclean(handle, sb, block))) {
255 err = PTR_ERR(it);
256 goto exit_bh;
257 }
258 ext4_journal_dirty_metadata(handle, it);
259 brelse(it);
260 ext4_set_bit(bit, bh->b_data);
261 }
262 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
263 bh->b_data);
264 ext4_journal_dirty_metadata(handle, bh);
265 brelse(bh);
266
 267	/* Mark the unused entries in the inode bitmap as in use */
268 ext4_debug("clear inode bitmap %#04x (+%ld)\n",
269 input->inode_bitmap, input->inode_bitmap - start);
270 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
271 err = PTR_ERR(bh);
272 goto exit_journal;
273 }
274
275 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
276 bh->b_data);
277 ext4_journal_dirty_metadata(handle, bh);
278exit_bh:
279 brelse(bh);
280
281exit_journal:
282 unlock_super(sb);
283 if ((err2 = ext4_journal_stop(handle)) && !err)
284 err = err2;
285
286 return err;
287}
288
289
290/*
291 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
292 * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before
293 * calling this for the first time. In a sparse filesystem it will be the
294 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
295 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
296 */
297static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
298 unsigned *five, unsigned *seven)
299{
300 unsigned *min = three;
301 int mult = 3;
302 unsigned ret;
303
304 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
305 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
306 ret = *min;
307 *min += 1;
308 return ret;
309 }
310
311 if (*five < *min) {
312 min = five;
313 mult = 5;
314 }
315 if (*seven < *min) {
316 min = seven;
317 mult = 7;
318 }
319
320 ret = *min;
321 *min *= mult;
322
323 return ret;
324}
325
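A quick way to see the sequence described in the comment above is to drive the same three counters from userspace. This standalone sketch assumes the sparse_super case and prints the backup groups below 1000:

#include <stdio.h>

/* Standalone sketch reproducing the backup-group walk for a sparse_super
 * filesystem: groups holding sb/GDT backups are powers of 3, 5 and 7. */
static unsigned list_backups(unsigned *three, unsigned *five, unsigned *seven)
{
	unsigned *min = three;
	unsigned mult = 3, ret;

	if (*five < *min) { min = five; mult = 5; }
	if (*seven < *min) { min = seven; mult = 7; }
	ret = *min;
	*min *= mult;
	return ret;
}

int main(void)
{
	unsigned three = 1, five = 5, seven = 7, grp;

	while ((grp = list_backups(&three, &five, &seven)) < 1000)
		printf(" %u", grp);   /* 1 3 5 7 9 25 27 49 81 125 243 343 625 729 */
	printf("\n");
	return 0;
}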
326/*
327 * Check that all of the backup GDT blocks are held in the primary GDT block.
328 * It is assumed that they are stored in group order. Returns the number of
 329 * groups in the current filesystem that have BACKUPS, or a negative error code.
330 */
331static int verify_reserved_gdb(struct super_block *sb,
332 struct buffer_head *primary)
333{
334 const ext4_fsblk_t blk = primary->b_blocknr;
335 const unsigned long end = EXT4_SB(sb)->s_groups_count;
336 unsigned three = 1;
337 unsigned five = 5;
338 unsigned seven = 7;
339 unsigned grp;
340 __le32 *p = (__le32 *)primary->b_data;
341 int gdbackups = 0;
342
343 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
344 if (le32_to_cpu(*p++) !=
345 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
346 ext4_warning(sb, __FUNCTION__,
347 "reserved GDT %llu"
348 " missing grp %d (%llu)",
349 blk, grp,
350 grp *
351 (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
352 blk);
353 return -EINVAL;
354 }
355 if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
356 return -EFBIG;
357 }
358
359 return gdbackups;
360}
361
362/*
363 * Called when we need to bring a reserved group descriptor table block into
364 * use from the resize inode. The primary copy of the new GDT block currently
365 * is an indirect block (under the double indirect block in the resize inode).
366 * The new backup GDT blocks will be stored as leaf blocks in this indirect
367 * block, in group order. Even though we know all the block numbers we need,
368 * we check to ensure that the resize inode has actually reserved these blocks.
369 *
370 * Don't need to update the block bitmaps because the blocks are still in use.
371 *
372 * We get all of the error cases out of the way, so that we are sure to not
373 * fail once we start modifying the data on disk, because JBD has no rollback.
374 */
375static int add_new_gdb(handle_t *handle, struct inode *inode,
376 struct ext4_new_group_data *input,
377 struct buffer_head **primary)
378{
379 struct super_block *sb = inode->i_sb;
380 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
381 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
382 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
383 struct buffer_head **o_group_desc, **n_group_desc;
384 struct buffer_head *dind;
385 int gdbackups;
386 struct ext4_iloc iloc;
387 __le32 *data;
388 int err;
389
390 if (test_opt(sb, DEBUG))
391 printk(KERN_DEBUG
392 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
393 gdb_num);
394
395 /*
 396	 * If we are not using the primary superblock/GDT copy, don't resize,
 397	 * because the user tools have no way of handling this.  Probably a
 398	 * bad time to do it anyway.
399 */
400 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
401 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
402 ext4_warning(sb, __FUNCTION__,
403 "won't resize using backup superblock at %llu",
404 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
405 return -EPERM;
406 }
407
408 *primary = sb_bread(sb, gdblock);
409 if (!*primary)
410 return -EIO;
411
412 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
413 err = gdbackups;
414 goto exit_bh;
415 }
416
417 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
418 dind = sb_bread(sb, le32_to_cpu(*data));
419 if (!dind) {
420 err = -EIO;
421 goto exit_bh;
422 }
423
424 data = (__le32 *)dind->b_data;
425 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
426 ext4_warning(sb, __FUNCTION__,
427 "new group %u GDT block %llu not reserved",
428 input->group, gdblock);
429 err = -EINVAL;
430 goto exit_dind;
431 }
432
433 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
434 goto exit_dind;
435
436 if ((err = ext4_journal_get_write_access(handle, *primary)))
437 goto exit_sbh;
438
439 if ((err = ext4_journal_get_write_access(handle, dind)))
440 goto exit_primary;
441
442 /* ext4_reserve_inode_write() gets a reference on the iloc */
443 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
444 goto exit_dindj;
445
446 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
447 GFP_KERNEL);
448 if (!n_group_desc) {
449 err = -ENOMEM;
450 ext4_warning (sb, __FUNCTION__,
451 "not enough memory for %lu groups", gdb_num + 1);
452 goto exit_inode;
453 }
454
455 /*
456 * Finally, we have all of the possible failures behind us...
457 *
458 * Remove new GDT block from inode double-indirect block and clear out
459 * the new GDT block for use (which also "frees" the backup GDT blocks
460 * from the reserved inode). We don't need to change the bitmaps for
461 * these blocks, because they are marked as in-use from being in the
462 * reserved inode, and will become GDT blocks (primary and backup).
463 */
464 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
465 ext4_journal_dirty_metadata(handle, dind);
466 brelse(dind);
467 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
468 ext4_mark_iloc_dirty(handle, inode, &iloc);
469 memset((*primary)->b_data, 0, sb->s_blocksize);
470 ext4_journal_dirty_metadata(handle, *primary);
471
472 o_group_desc = EXT4_SB(sb)->s_group_desc;
473 memcpy(n_group_desc, o_group_desc,
474 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
475 n_group_desc[gdb_num] = *primary;
476 EXT4_SB(sb)->s_group_desc = n_group_desc;
477 EXT4_SB(sb)->s_gdb_count++;
478 kfree(o_group_desc);
479
480 es->s_reserved_gdt_blocks =
481 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
482 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
483
484 return 0;
485
486exit_inode:
487 //ext4_journal_release_buffer(handle, iloc.bh);
488 brelse(iloc.bh);
489exit_dindj:
490 //ext4_journal_release_buffer(handle, dind);
491exit_primary:
492 //ext4_journal_release_buffer(handle, *primary);
493exit_sbh:
494 //ext4_journal_release_buffer(handle, *primary);
495exit_dind:
496 brelse(dind);
497exit_bh:
498 brelse(*primary);
499
500 ext4_debug("leaving with error %d\n", err);
501 return err;
502}
503
504/*
505 * Called when we are adding a new group which has a backup copy of each of
506 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
507 * We need to add these reserved backup GDT blocks to the resize inode, so
508 * that they are kept for future resizing and not allocated to files.
509 *
510 * Each reserved backup GDT block will go into a different indirect block.
511 * The indirect blocks are actually the primary reserved GDT blocks,
512 * so we know in advance what their block numbers are. We only get the
513 * double-indirect block to verify it is pointing to the primary reserved
514 * GDT blocks so we don't overwrite a data block by accident. The reserved
515 * backup GDT blocks are stored in their reserved primary GDT block.
516 */
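/*
 * A rough picture of that layout (sketch only):
 *
 *	resize inode i_data[EXT4_DIND_BLOCK]
 *	  -> DIND block: [ gdb 0 .. gdb N-1 | rsv gdb 0 | rsv gdb 1 | ... ]
 *	       each "rsv gdb i" (an indirect block) in turn lists the block
 *	       numbers of its own backup copies, one per sparse group:
 *	  -> rsv gdb i: [ backup in grp 1 | backup in grp 3 | grp 5 | ... ]
 */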
517static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
518 struct ext4_new_group_data *input)
519{
520 struct super_block *sb = inode->i_sb;
521 int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
522 struct buffer_head **primary;
523 struct buffer_head *dind;
524 struct ext4_iloc iloc;
525 ext4_fsblk_t blk;
526 __le32 *data, *end;
527 int gdbackups = 0;
528 int res, i;
529 int err;
530
531 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
532 if (!primary)
533 return -ENOMEM;
534
535 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
536 dind = sb_bread(sb, le32_to_cpu(*data));
537 if (!dind) {
538 err = -EIO;
539 goto exit_free;
540 }
541
542 blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
543 data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
544 end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
545
546 /* Get each reserved primary GDT block and verify it holds backups */
547 for (res = 0; res < reserved_gdb; res++, blk++) {
548 if (le32_to_cpu(*data) != blk) {
549 ext4_warning(sb, __FUNCTION__,
550 "reserved block %llu"
551 " not at offset %ld",
552 blk,
553 (long)(data - (__le32 *)dind->b_data));
554 err = -EINVAL;
555 goto exit_bh;
556 }
557 primary[res] = sb_bread(sb, blk);
558 if (!primary[res]) {
559 err = -EIO;
560 goto exit_bh;
561 }
562 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
563 brelse(primary[res]);
564 err = gdbackups;
565 goto exit_bh;
566 }
567 if (++data >= end)
568 data = (__le32 *)dind->b_data;
569 }
570
571 for (i = 0; i < reserved_gdb; i++) {
572 if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
573 /*
574 int j;
575 for (j = 0; j < i; j++)
576 ext4_journal_release_buffer(handle, primary[j]);
577 */
578 goto exit_bh;
579 }
580 }
581
582 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
583 goto exit_bh;
584
585 /*
586 * Finally we can add each of the reserved backup GDT blocks from
587 * the new group to its reserved primary GDT block.
588 */
589 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
590 for (i = 0; i < reserved_gdb; i++) {
591 int err2;
592 data = (__le32 *)primary[i]->b_data;
593 /* printk("reserving backup %lu[%u] = %lu\n",
594 primary[i]->b_blocknr, gdbackups,
595 blk + primary[i]->b_blocknr); */
596 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
597 err2 = ext4_journal_dirty_metadata(handle, primary[i]);
598 if (!err)
599 err = err2;
600 }
601 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
602 ext4_mark_iloc_dirty(handle, inode, &iloc);
603
604exit_bh:
605 while (--res >= 0)
606 brelse(primary[res]);
607 brelse(dind);
608
609exit_free:
610 kfree(primary);
611
612 return err;
613}
614
615/*
616 * Update the backup copies of the ext4 metadata. These don't need to be part
617 * of the main resize transaction, because e2fsck will re-write them if there
618 * is a problem (basically only OOM will cause a problem). However, we
619 * _should_ update the backups if possible, in case the primary gets trashed
620 * for some reason and we need to run e2fsck from a backup superblock. The
621 * important part is that the new block and inode counts are in the backup
622 * superblocks, and the location of the new group metadata in the GDT backups.
623 *
624 * We do not need lock_super() for this, because these blocks are not
625 * otherwise touched by the filesystem code when it is mounted. We don't
626 * need to worry about 'last' lagging behind sbi->s_groups_count, because the
627 * worst that can happen is that we do not copy the full number of backups
628 * at this time. The resize which changed s_groups_count will backup again.
629 */
630static void update_backups(struct super_block *sb,
631 int blk_off, char *data, int size)
632{
633 struct ext4_sb_info *sbi = EXT4_SB(sb);
634 const unsigned long last = sbi->s_groups_count;
635 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
636 unsigned three = 1;
637 unsigned five = 5;
638 unsigned seven = 7;
639 unsigned group;
640 int rest = sb->s_blocksize - size;
641 handle_t *handle;
642 int err = 0, err2;
643
644 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
645 if (IS_ERR(handle)) {
646 group = 1;
647 err = PTR_ERR(handle);
648 goto exit_err;
649 }
650
651 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
652 struct buffer_head *bh;
653
654 /* Out of journal space, and can't get more - abort - so sad */
655 if (handle->h_buffer_credits == 0 &&
656 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
657 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
658 break;
659
660 bh = sb_getblk(sb, group * bpg + blk_off);
661 if (!bh) {
662 err = -EIO;
663 break;
664 }
665 ext4_debug("update metadata backup %#04lx\n",
666 (unsigned long)bh->b_blocknr);
667 if ((err = ext4_journal_get_write_access(handle, bh)))
668 break;
669 lock_buffer(bh);
670 memcpy(bh->b_data, data, size);
671 if (rest)
672 memset(bh->b_data + size, 0, rest);
673 set_buffer_uptodate(bh);
674 unlock_buffer(bh);
675 ext4_journal_dirty_metadata(handle, bh);
676 brelse(bh);
677 }
678 if ((err2 = ext4_journal_stop(handle)) && !err)
679 err = err2;
680
681 /*
682 * Ugh! Need to have e2fsck write the backup copies. It is too
683 * late to revert the resize, and we shouldn't fail just because of
684 * the backup copies (they are only needed in case of corruption).
685 *
686 * However, if we got here we have a journal problem too, so we
687 * can't really start a transaction to mark the superblock.
688 * Chicken out and just set the flag in the hope it will be written
689 * to disk, and if not - we will simply wait until next fsck.
690 */
691exit_err:
692 if (err) {
693 ext4_warning(sb, __FUNCTION__,
694 "can't update backup for group %d (err %d), "
695 "forcing fsck on next reboot", group, err);
696 sbi->s_mount_state &= ~EXT4_VALID_FS;
697 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
698 mark_buffer_dirty(sbi->s_sbh);
699 }
700}
701
702/* Add group descriptor data to an existing or new group descriptor block.
703 * Ensure we handle all possible error conditions _before_ we start modifying
704 * the filesystem, because we cannot abort the transaction and not have it
705 * write the data to disk.
706 *
707 * If we are on a GDT block boundary, we need to get the reserved GDT block.
708 * Otherwise, we may need to add backup GDT blocks for a sparse group.
709 *
710 * We only need to hold the superblock lock while we are actually adding
711 * in the new group's counts to the superblock. Prior to that we have
712 * not really "added" the group at all. We re-check that we are still
713 * adding in the last group in case things have changed since verifying.
714 */
715int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
716{
717 struct ext4_sb_info *sbi = EXT4_SB(sb);
718 struct ext4_super_block *es = sbi->s_es;
719 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
720 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
721 struct buffer_head *primary = NULL;
722 struct ext4_group_desc *gdp;
723 struct inode *inode = NULL;
724 handle_t *handle;
725 int gdb_off, gdb_num;
726 int err, err2;
727
728 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
729 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
730
731 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
732 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
733 ext4_warning(sb, __FUNCTION__,
734 "Can't resize non-sparse filesystem further");
735 return -EPERM;
736 }
737
738 if (ext4_blocks_count(es) + input->blocks_count <
739 ext4_blocks_count(es)) {
740  ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
741 return -EINVAL;
742 }
743
744 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
745 le32_to_cpu(es->s_inodes_count)) {
746  ext4_warning(sb, __FUNCTION__, "inodes_count overflow");
747 return -EINVAL;
748 }
749
750 if (reserved_gdb || gdb_off == 0) {
751 if (!EXT4_HAS_COMPAT_FEATURE(sb,
752 EXT4_FEATURE_COMPAT_RESIZE_INODE)){
753 ext4_warning(sb, __FUNCTION__,
754 "No reserved GDT blocks, can't resize");
755 return -EPERM;
756 }
757 inode = iget(sb, EXT4_RESIZE_INO);
758 if (!inode || is_bad_inode(inode)) {
759 ext4_warning(sb, __FUNCTION__,
760 "Error opening resize inode");
761 iput(inode);
762 return -ENOENT;
763 }
764 }
765
766 if ((err = verify_group_input(sb, input)))
767 goto exit_put;
768
769 if ((err = setup_new_group_blocks(sb, input)))
770 goto exit_put;
771
772 /*
773 * We will always be modifying at least the superblock and a GDT
774 * block. If we are adding a group past the last current GDT block,
775 * we will also modify the inode and the dindirect block. If we
776 * are adding a group with superblock/GDT backups we will also
777 * modify each of the reserved GDT dindirect blocks.
778 */
779 handle = ext4_journal_start_sb(sb,
780 ext4_bg_has_super(sb, input->group) ?
781 3 + reserved_gdb : 4);
782 if (IS_ERR(handle)) {
783 err = PTR_ERR(handle);
784 goto exit_put;
785 }
786
787 lock_super(sb);
788 if (input->group != sbi->s_groups_count) {
789 ext4_warning(sb, __FUNCTION__,
790 "multiple resizers run on filesystem!");
791 err = -EBUSY;
792 goto exit_journal;
793 }
794
795 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
796 goto exit_journal;
797
798 /*
799 * We will only either add reserved group blocks to a backup group
800 * or remove reserved blocks for the first group in a new group block.
801 * Doing both would mean more complex code, and sane people don't
802 * use non-sparse filesystems anymore. This is already checked above.
803 */
804 if (gdb_off) {
805 primary = sbi->s_group_desc[gdb_num];
806 if ((err = ext4_journal_get_write_access(handle, primary)))
807 goto exit_journal;
808
809 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
810 (err = reserve_backup_gdb(handle, inode, input)))
811 goto exit_journal;
812 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
813 goto exit_journal;
814
815 /*
816 * OK, now we've set up the new group. Time to make it active.
817 *
818 * Current kernels don't lock all allocations via lock_super(),
819 * so we have to be safe wrt. concurrent accesses to the group
820 * data. So we need to be careful to set all of the relevant
821 * group descriptor data etc. *before* we enable the group.
822 *
823 * The key field here is sbi->s_groups_count: as long as
824 * that retains its old value, nobody is going to access the new
825 * group.
826 *
827 * So first we update all the descriptor metadata for the new
828 * group; then we update the total disk blocks count; then we
829 * update the groups count to enable the group; then finally we
830 * update the free space counts so that the system can start
831 * using the new disk blocks.
832 */
833
834 /* Update group descriptor block for new group */
835 gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
836
837 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
838 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
839 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
840 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
841 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
842
843 /*
844 * Make the new blocks and inodes valid next. We do this before
845 * increasing the group count so that once the group is enabled,
846 * all of its blocks and inodes are already valid.
847 *
848 * We always allocate group-by-group, then block-by-block or
849 * inode-by-inode within a group, so enabling these
850 * blocks/inodes before the group is live won't actually let us
851 * allocate the new space yet.
852 */
853 ext4_blocks_count_set(es, ext4_blocks_count(es) +
854 input->blocks_count);
855 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
856 EXT4_INODES_PER_GROUP(sb));
857
858 /*
859 * We need to protect s_groups_count against other CPUs seeing
860 * inconsistent state in the superblock.
861 *
862 * The precise rules we use are:
863 *
864 * * Writers of s_groups_count *must* hold lock_super
865 * AND
866 * * Writers must perform a smp_wmb() after updating all dependent
867 * data and before modifying the groups count
868 *
869 * * Readers must hold lock_super() over the access
870 * OR
871 * * Readers must perform an smp_rmb() after reading the groups count
872 * and before reading any dependent data.
873 *
874 * NB. These rules can be relaxed when checking the group count
875 * while freeing data, as we can only allocate from a block
876 * group after serialising against the group count, and we can
877 * only then free after serialising in turn against that
878 * allocation.
879 */
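	/*
	 * Reader-side sketch (illustrative only, not code that lives in
	 * this function): a lockless reader pairs with the smp_wmb()
	 * below roughly as
	 *
	 *	ngroups = sbi->s_groups_count;
	 *	smp_rmb();
	 *	... read descriptors only for groups < ngroups ...
	 */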
880 smp_wmb();
881
882 /* Update the global fs size fields */
883 sbi->s_groups_count++;
884
885 ext4_journal_dirty_metadata(handle, primary);
886
887 /* Update the reserved block counts only once the new group is
888 * active. */
889 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
890 input->reserved_blocks);
891
892 /* Update the free space counts */
893 percpu_counter_mod(&sbi->s_freeblocks_counter,
894 input->free_blocks_count);
895 percpu_counter_mod(&sbi->s_freeinodes_counter,
896 EXT4_INODES_PER_GROUP(sb));
897
898 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
899 sb->s_dirt = 1;
900
901exit_journal:
902 unlock_super(sb);
903 if ((err2 = ext4_journal_stop(handle)) && !err)
904 err = err2;
905 if (!err) {
906 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
907 sizeof(struct ext4_super_block));
908 update_backups(sb, primary->b_blocknr, primary->b_data,
909 primary->b_size);
910 }
911exit_put:
912 iput(inode);
913 return err;
914} /* ext4_group_add */
915
916/* Extend the filesystem to the new number of blocks specified. This entry
917 * point is only used to extend the current filesystem to the end of the last
918 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
919 * for emergencies (because it has no dependencies on reserved blocks).
920 *
921 * If we _really_ wanted, we could use default values to call ext4_group_add()
922 * to allow the "remount" trick to work for arbitrary resizing, assuming enough
923 * GDT blocks are reserved to grow to the desired size.
924 */
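/*
 * Illustrative user-space caller (a sketch; it assumes the group-extend
 * ioctl wired up elsewhere in ext4, referred to here as
 * EXT4_IOC_GROUP_EXTEND, and an fd open on the mounted filesystem):
 *
 *	unsigned long new_blocks_count = ...;
 *	ioctl(fd, EXT4_IOC_GROUP_EXTEND, &new_blocks_count);
 *
 * which eventually reaches this function with
 * n_blocks_count == new_blocks_count.
 */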
925int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
926 ext4_fsblk_t n_blocks_count)
927{
928 ext4_fsblk_t o_blocks_count;
929 unsigned long o_groups_count;
930 ext4_grpblk_t last;
931 ext4_grpblk_t add;
932 struct buffer_head * bh;
933 handle_t *handle;
934 int err;
935 unsigned long freed_blocks;
936
937 /* We don't need to worry about locking wrt other resizers just
938 * yet: we're going to revalidate es->s_blocks_count after
939 * taking lock_super() below. */
940 o_blocks_count = ext4_blocks_count(es);
941 o_groups_count = EXT4_SB(sb)->s_groups_count;
942
943 if (test_opt(sb, DEBUG))
944  printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
945 o_blocks_count, n_blocks_count);
946
947 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
948 return 0;
949
950 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
951 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
952 " too large to resize to %llu blocks safely\n",
953 sb->s_id, n_blocks_count);
954 if (sizeof(sector_t) < 8)
955 ext4_warning(sb, __FUNCTION__,
956      "CONFIG_LBD not enabled");
957 return -EINVAL;
958 }
959
960 if (n_blocks_count < o_blocks_count) {
961 ext4_warning(sb, __FUNCTION__,
962 "can't shrink FS - resize aborted");
963 return -EBUSY;
964 }
965
966 /* Handle the remaining blocks in the last group only. */
967 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
968
969 if (last == 0) {
970 ext4_warning(sb, __FUNCTION__,
971 "need to use ext2online to resize further");
972 return -EPERM;
973 }
974
975 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
976
977 if (o_blocks_count + add < o_blocks_count) {
978 ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
979 return -EINVAL;
980 }
981
982 if (o_blocks_count + add > n_blocks_count)
983 add = n_blocks_count - o_blocks_count;
984
985 if (o_blocks_count + add < n_blocks_count)
986 ext4_warning(sb, __FUNCTION__,
987 "will only finish group (%llu"
988 " blocks, %u new)",
989 o_blocks_count + add, add);
990
991 /* See if the device is actually as big as what was requested */
992 bh = sb_bread(sb, o_blocks_count + add - 1);
993 if (!bh) {
994 ext4_warning(sb, __FUNCTION__,
995 "can't read last block, resize aborted");
996 return -ENOSPC;
997 }
998 brelse(bh);
999
1000 /* We will update the superblock, one block bitmap, and
1001 * one group descriptor via ext4_free_blocks().
1002 */
1003 handle = ext4_journal_start_sb(sb, 3);
1004 if (IS_ERR(handle)) {
1005 err = PTR_ERR(handle);
1006 ext4_warning(sb, __FUNCTION__, "error %d on journal start",err);
1007 goto exit_put;
1008 }
1009
1010 lock_super(sb);
1011 if (o_blocks_count != ext4_blocks_count(es)) {
1012 ext4_warning(sb, __FUNCTION__,
1013 "multiple resizers run on filesystem!");
1014 unlock_super(sb);
1015 err = -EBUSY;
1016 goto exit_put;
1017 }
1018
1019 if ((err = ext4_journal_get_write_access(handle,
1020 EXT4_SB(sb)->s_sbh))) {
1021 ext4_warning(sb, __FUNCTION__,
1022 "error %d on journal write access", err);
1023 unlock_super(sb);
1024 ext4_journal_stop(handle);
1025 goto exit_put;
1026 }
1027 ext4_blocks_count_set(es, o_blocks_count + add);
1028 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1029 sb->s_dirt = 1;
1030 unlock_super(sb);
1031 ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
1032 o_blocks_count + add);
1033 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1034 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1035 o_blocks_count + add);
1036 if ((err = ext4_journal_stop(handle)))
1037 goto exit_put;
1038 if (test_opt(sb, DEBUG))
1039 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1040 ext4_blocks_count(es));
1041 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1042 sizeof(struct ext4_super_block));
1043exit_put:
1044 return err;
1045} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
1/*
2 * linux/fs/ext4/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38
39#include <asm/uaccess.h>
40
41#include "xattr.h"
42#include "acl.h"
43#include "namei.h"
44
45static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
46 unsigned long journal_devnum);
47static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
48 unsigned int);
49static void ext4_commit_super (struct super_block * sb,
50 struct ext4_super_block * es,
51 int sync);
52static void ext4_mark_recovery_complete(struct super_block * sb,
53 struct ext4_super_block * es);
54static void ext4_clear_journal_err(struct super_block * sb,
55 struct ext4_super_block * es);
56static int ext4_sync_fs(struct super_block *sb, int wait);
57static const char *ext4_decode_error(struct super_block * sb, int errno,
58 char nbuf[16]);
59static int ext4_remount (struct super_block * sb, int * flags, char * data);
60static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
61static void ext4_unlockfs(struct super_block *sb);
62static void ext4_write_super (struct super_block * sb);
63static void ext4_write_super_lockfs(struct super_block *sb);
64
65
66ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
67 struct ext4_group_desc *bg)
68{
69 return le32_to_cpu(bg->bg_block_bitmap) |
70 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
71 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
72}
73
74ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
75 struct ext4_group_desc *bg)
76{
77 return le32_to_cpu(bg->bg_inode_bitmap) |
78 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
79 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
80}
81
82ext4_fsblk_t ext4_inode_table(struct super_block *sb,
83 struct ext4_group_desc *bg)
84{
85 return le32_to_cpu(bg->bg_inode_table) |
86 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
87 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
88}
89
90void ext4_block_bitmap_set(struct super_block *sb,
91 struct ext4_group_desc *bg, ext4_fsblk_t blk)
92{
93 bg->bg_block_bitmap = cpu_to_le32((u32)blk);
94 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
95 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
96}
97
98void ext4_inode_bitmap_set(struct super_block *sb,
99 struct ext4_group_desc *bg, ext4_fsblk_t blk)
100{
101 bg->bg_inode_bitmap = cpu_to_le32((u32)blk);
102 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
103 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
104}
105
106void ext4_inode_table_set(struct super_block *sb,
107 struct ext4_group_desc *bg, ext4_fsblk_t blk)
108{
109 bg->bg_inode_table = cpu_to_le32((u32)blk);
110 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
111 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
112}
113
114/*
115 * Wrappers for jbd2_journal_start/end.
116 *
117 * The only special thing we need to do here is to make sure that all
118 * journal_end calls result in the superblock being marked dirty, so
119 * that sync() will call the filesystem's write_super callback if
120 * appropriate.
121 */
122handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
123{
124 journal_t *journal;
125
126 if (sb->s_flags & MS_RDONLY)
127 return ERR_PTR(-EROFS);
128
129 /* Special case here: if the journal has aborted behind our
130 * backs (eg. EIO in the commit thread), then we still need to
131 * take the FS itself readonly cleanly. */
132 journal = EXT4_SB(sb)->s_journal;
133 if (is_journal_aborted(journal)) {
134 ext4_abort(sb, __FUNCTION__,
135 "Detected aborted journal");
136 return ERR_PTR(-EROFS);
137 }
138
139 return jbd2_journal_start(journal, nblocks);
140}
141
142/*
143 * The only special thing we need to do here is to make sure that all
144 * jbd2_journal_stop calls result in the superblock being marked dirty, so
145 * that sync() will call the filesystem's write_super callback if
146 * appropriate.
147 */
148int __ext4_journal_stop(const char *where, handle_t *handle)
149{
150 struct super_block *sb;
151 int err;
152 int rc;
153
154 sb = handle->h_transaction->t_journal->j_private;
155 err = handle->h_err;
156 rc = jbd2_journal_stop(handle);
157
158 if (!err)
159 err = rc;
160 if (err)
161 __ext4_std_error(sb, where, err);
162 return err;
163}
164
165void ext4_journal_abort_handle(const char *caller, const char *err_fn,
166 struct buffer_head *bh, handle_t *handle, int err)
167{
168 char nbuf[16];
169 const char *errstr = ext4_decode_error(NULL, err, nbuf);
170
171 if (bh)
172 BUFFER_TRACE(bh, "abort");
173
174 if (!handle->h_err)
175 handle->h_err = err;
176
177 if (is_handle_aborted(handle))
178 return;
179
180 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
181 caller, errstr, err_fn);
182
183 jbd2_journal_abort_handle(handle);
184}
185
186/* Deal with the reporting of failure conditions on a filesystem such as
187 * inconsistencies detected or read IO failures.
188 *
189 * On ext2, we can store the error state of the filesystem in the
190 * superblock. That is not possible on ext4, because we may have other
191 * write ordering constraints on the superblock which prevent us from
192 * writing it out straight away; and given that the journal is about to
193 * be aborted, we can't rely on the current, or future, transactions to
194 * write out the superblock safely.
195 *
196 * We'll just use the jbd2_journal_abort() error code to record an error in
197 * the journal instead. On recovery, the journal will complain about
198 * that error until we've noted it down and cleared it.
199 */
200
201static void ext4_handle_error(struct super_block *sb)
202{
203 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
204
205 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
206 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
207
208 if (sb->s_flags & MS_RDONLY)
209 return;
210
211 if (!test_opt (sb, ERRORS_CONT)) {
212 journal_t *journal = EXT4_SB(sb)->s_journal;
213
214 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
215 if (journal)
216 jbd2_journal_abort(journal, -EIO);
217 }
218 if (test_opt (sb, ERRORS_RO)) {
219 printk (KERN_CRIT "Remounting filesystem read-only\n");
220 sb->s_flags |= MS_RDONLY;
221 }
222 ext4_commit_super(sb, es, 1);
223 if (test_opt(sb, ERRORS_PANIC))
224 panic("EXT4-fs (device %s): panic forced after error\n",
225 sb->s_id);
226}
227
228void ext4_error (struct super_block * sb, const char * function,
229 const char * fmt, ...)
230{
231 va_list args;
232
233 va_start(args, fmt);
234 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
235 vprintk(fmt, args);
236 printk("\n");
237 va_end(args);
238
239 ext4_handle_error(sb);
240}
241
242static const char *ext4_decode_error(struct super_block * sb, int errno,
243 char nbuf[16])
244{
245 char *errstr = NULL;
246
247 switch (errno) {
248 case -EIO:
249 errstr = "IO failure";
250 break;
251 case -ENOMEM:
252 errstr = "Out of memory";
253 break;
254 case -EROFS:
255 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
256 errstr = "Journal has aborted";
257 else
258 errstr = "Readonly filesystem";
259 break;
260 default:
261 /* If the caller passed in an extra buffer for unknown
262 * errors, textualise them now. Else we just return
263 * NULL. */
264 if (nbuf) {
265 /* Check for truncated error codes... */
266 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
267 errstr = nbuf;
268 }
269 break;
270 }
271
272 return errstr;
273}
274
275/* __ext4_std_error decodes expected errors from journaling functions
276 * automatically and invokes the appropriate error response. */
277
278void __ext4_std_error (struct super_block * sb, const char * function,
279 int errno)
280{
281 char nbuf[16];
282 const char *errstr;
283
284 /* Special case: if the error is EROFS, and we're not already
285 * inside a transaction, then there's really no point in logging
286 * an error. */
287 if (errno == -EROFS && journal_current_handle() == NULL &&
288 (sb->s_flags & MS_RDONLY))
289 return;
290
291 errstr = ext4_decode_error(sb, errno, nbuf);
292 printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
293 sb->s_id, function, errstr);
294
295 ext4_handle_error(sb);
296}
297
298/*
299 * ext4_abort is a much stronger failure handler than ext4_error. The
300 * abort function may be used to deal with unrecoverable failures such
301 * as journal IO errors or ENOMEM at a critical moment in log management.
302 *
303 * We unconditionally force the filesystem into an ABORT|READONLY state,
304 * unless the error response on the fs has been set to panic in which
305 * case we take the easy way out and panic immediately.
306 */
307
308void ext4_abort (struct super_block * sb, const char * function,
309 const char * fmt, ...)
310{
311 va_list args;
312
313 printk (KERN_CRIT "ext4_abort called.\n");
314
315 va_start(args, fmt);
316 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
317 vprintk(fmt, args);
318 printk("\n");
319 va_end(args);
320
321 if (test_opt(sb, ERRORS_PANIC))
322 panic("EXT4-fs panic from previous error\n");
323
324 if (sb->s_flags & MS_RDONLY)
325 return;
326
327 printk(KERN_CRIT "Remounting filesystem read-only\n");
328 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
329 sb->s_flags |= MS_RDONLY;
330 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
331 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
332}
333
334void ext4_warning (struct super_block * sb, const char * function,
335 const char * fmt, ...)
336{
337 va_list args;
338
339 va_start(args, fmt);
340 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
341 sb->s_id, function);
342 vprintk(fmt, args);
343 printk("\n");
344 va_end(args);
345}
346
347void ext4_update_dynamic_rev(struct super_block *sb)
348{
349 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
350
351 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
352 return;
353
354 ext4_warning(sb, __FUNCTION__,
355 "updating to rev %d because of new feature flag, "
356 "running e2fsck is recommended",
357 EXT4_DYNAMIC_REV);
358
359 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
360 es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
361 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
362 /* leave es->s_feature_*compat flags alone */
363 /* es->s_uuid will be set by e2fsck if empty */
364
365 /*
366 * The rest of the superblock fields should be zero, and if not it
367 * means they are likely already in use, so leave them alone. We
368 * can leave it up to e2fsck to clean up any inconsistencies there.
369 */
370}
371
372/*
373 * Open the external journal device
374 */
375static struct block_device *ext4_blkdev_get(dev_t dev)
376{
377 struct block_device *bdev;
378 char b[BDEVNAME_SIZE];
379
380 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
381 if (IS_ERR(bdev))
382 goto fail;
383 return bdev;
384
385fail:
386 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
387 __bdevname(dev, b), PTR_ERR(bdev));
388 return NULL;
389}
390
391/*
392 * Release the journal device
393 */
394static int ext4_blkdev_put(struct block_device *bdev)
395{
396 bd_release(bdev);
397 return blkdev_put(bdev);
398}
399
400static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
401{
402 struct block_device *bdev;
403 int ret = -ENODEV;
404
405 bdev = sbi->journal_bdev;
406 if (bdev) {
407 ret = ext4_blkdev_put(bdev);
408 sbi->journal_bdev = NULL;
409 }
410 return ret;
411}
412
413static inline struct inode *orphan_list_entry(struct list_head *l)
414{
415 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
416}
417
418static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
419{
420 struct list_head *l;
421
422 printk(KERN_ERR "sb orphan head is %d\n",
423 le32_to_cpu(sbi->s_es->s_last_orphan));
424
425 printk(KERN_ERR "sb_info orphan list:\n");
426 list_for_each(l, &sbi->s_orphan) {
427 struct inode *inode = orphan_list_entry(l);
428 printk(KERN_ERR " "
429 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
430 inode->i_sb->s_id, inode->i_ino, inode,
431 inode->i_mode, inode->i_nlink,
432 NEXT_ORPHAN(inode));
433 }
434}
435
436static void ext4_put_super (struct super_block * sb)
437{
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 struct ext4_super_block *es = sbi->s_es;
440 int i;
441
442 ext4_ext_release(sb);
443 ext4_xattr_put_super(sb);
444 jbd2_journal_destroy(sbi->s_journal);
445 if (!(sb->s_flags & MS_RDONLY)) {
446 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
447 es->s_state = cpu_to_le16(sbi->s_mount_state);
448 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
449 mark_buffer_dirty(sbi->s_sbh);
450 ext4_commit_super(sb, es, 1);
451 }
452
453 for (i = 0; i < sbi->s_gdb_count; i++)
454 brelse(sbi->s_group_desc[i]);
455 kfree(sbi->s_group_desc);
456 percpu_counter_destroy(&sbi->s_freeblocks_counter);
457 percpu_counter_destroy(&sbi->s_freeinodes_counter);
458 percpu_counter_destroy(&sbi->s_dirs_counter);
459 brelse(sbi->s_sbh);
460#ifdef CONFIG_QUOTA
461 for (i = 0; i < MAXQUOTAS; i++)
462 kfree(sbi->s_qf_names[i]);
463#endif
464
465 /* Debugging code just in case the in-memory inode orphan list
466 * isn't empty. The on-disk one can be non-empty if we've
467 * detected an error and taken the fs readonly, but the
468 * in-memory list had better be clean by this point. */
469 if (!list_empty(&sbi->s_orphan))
470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan));
472
473 invalidate_bdev(sb->s_bdev, 0);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /*
476 * Invalidate the journal device's buffers. We don't want them
477 * floating about in memory - the physical journal device may be
478 * hotswapped, and it breaks the `ro-after' testing code.
479 */
480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0);
482 ext4_blkdev_remove(sbi);
483 }
484 sb->s_fs_info = NULL;
485 kfree(sbi);
486 return;
487}
488
489static kmem_cache_t *ext4_inode_cachep;
490
491/*
492 * Called inside transaction, so use GFP_NOFS
493 */
494static struct inode *ext4_alloc_inode(struct super_block *sb)
495{
496 struct ext4_inode_info *ei;
497
498 ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
499 if (!ei)
500 return NULL;
501#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
502 ei->i_acl = EXT4_ACL_NOT_CACHED;
503 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
504#endif
505 ei->i_block_alloc_info = NULL;
506 ei->vfs_inode.i_version = 1;
507 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
508 return &ei->vfs_inode;
509}
510
511static void ext4_destroy_inode(struct inode *inode)
512{
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514}
515
516static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem);
525#endif
526 mutex_init(&ei->truncate_mutex);
527 inode_init_once(&ei->vfs_inode);
528 }
529}
530
531static int init_inodecache(void)
532{
533 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
534 sizeof(struct ext4_inode_info),
535 0, (SLAB_RECLAIM_ACCOUNT|
536 SLAB_MEM_SPREAD),
537 init_once, NULL);
538 if (ext4_inode_cachep == NULL)
539 return -ENOMEM;
540 return 0;
541}
542
543static void destroy_inodecache(void)
544{
545 kmem_cache_destroy(ext4_inode_cachep);
546}
547
548static void ext4_clear_inode(struct inode *inode)
549{
550 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
551#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
552 if (EXT4_I(inode)->i_acl &&
553 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
554 posix_acl_release(EXT4_I(inode)->i_acl);
555 EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
556 }
557 if (EXT4_I(inode)->i_default_acl &&
558 EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
559 posix_acl_release(EXT4_I(inode)->i_default_acl);
560 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
561 }
562#endif
563 ext4_discard_reservation(inode);
564 EXT4_I(inode)->i_block_alloc_info = NULL;
565 if (unlikely(rsv))
566 kfree(rsv);
567}
568
569static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
570{
571#if defined(CONFIG_QUOTA)
572 struct ext4_sb_info *sbi = EXT4_SB(sb);
573
574 if (sbi->s_jquota_fmt)
575 seq_printf(seq, ",jqfmt=%s",
576 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
577
578 if (sbi->s_qf_names[USRQUOTA])
579 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
580
581 if (sbi->s_qf_names[GRPQUOTA])
582 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
583
584 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
585 seq_puts(seq, ",usrquota");
586
587 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
588 seq_puts(seq, ",grpquota");
589#endif
590}
591
592static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
593{
594 struct super_block *sb = vfs->mnt_sb;
595
596 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
597 seq_puts(seq, ",data=journal");
598 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
599 seq_puts(seq, ",data=ordered");
600 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
601 seq_puts(seq, ",data=writeback");
602
603 ext4_show_quota_options(seq, sb);
604
605 return 0;
606}
607
608
609static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
610{
611 __u32 *objp = vobjp;
612 unsigned long ino = objp[0];
613 __u32 generation = objp[1];
614 struct inode *inode;
615 struct dentry *result;
616
617 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
618 return ERR_PTR(-ESTALE);
619 if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
620 return ERR_PTR(-ESTALE);
621
622 /* iget isn't really right if the inode is currently unallocated!!
623 *
624 * ext4_read_inode will return a bad_inode if the inode had been
625 * deleted, so we should be safe.
626 *
627 * Currently we don't know the generation for parent directory, so
628 * a generation of 0 means "accept any"
629 */
630 inode = iget(sb, ino);
631 if (inode == NULL)
632 return ERR_PTR(-ENOMEM);
633 if (is_bad_inode(inode) ||
634 (generation && inode->i_generation != generation)) {
635 iput(inode);
636 return ERR_PTR(-ESTALE);
637 }
638 /* now to find a dentry.
639 * If possible, get a well-connected one
640 */
641 result = d_alloc_anon(inode);
642 if (!result) {
643 iput(inode);
644 return ERR_PTR(-ENOMEM);
645 }
646 return result;
647}
648
649#ifdef CONFIG_QUOTA
650#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
651#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
652
653static int ext4_dquot_initialize(struct inode *inode, int type);
654static int ext4_dquot_drop(struct inode *inode);
655static int ext4_write_dquot(struct dquot *dquot);
656static int ext4_acquire_dquot(struct dquot *dquot);
657static int ext4_release_dquot(struct dquot *dquot);
658static int ext4_mark_dquot_dirty(struct dquot *dquot);
659static int ext4_write_info(struct super_block *sb, int type);
660static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
661static int ext4_quota_on_mount(struct super_block *sb, int type);
662static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
663 size_t len, loff_t off);
664static ssize_t ext4_quota_write(struct super_block *sb, int type,
665 const char *data, size_t len, loff_t off);
666
667static struct dquot_operations ext4_quota_operations = {
668 .initialize = ext4_dquot_initialize,
669 .drop = ext4_dquot_drop,
670 .alloc_space = dquot_alloc_space,
671 .alloc_inode = dquot_alloc_inode,
672 .free_space = dquot_free_space,
673 .free_inode = dquot_free_inode,
674 .transfer = dquot_transfer,
675 .write_dquot = ext4_write_dquot,
676 .acquire_dquot = ext4_acquire_dquot,
677 .release_dquot = ext4_release_dquot,
678 .mark_dirty = ext4_mark_dquot_dirty,
679 .write_info = ext4_write_info
680};
681
682static struct quotactl_ops ext4_qctl_operations = {
683 .quota_on = ext4_quota_on,
684 .quota_off = vfs_quota_off,
685 .quota_sync = vfs_quota_sync,
686 .get_info = vfs_get_dqinfo,
687 .set_info = vfs_set_dqinfo,
688 .get_dqblk = vfs_get_dqblk,
689 .set_dqblk = vfs_set_dqblk
690};
691#endif
692
693static struct super_operations ext4_sops = {
694 .alloc_inode = ext4_alloc_inode,
695 .destroy_inode = ext4_destroy_inode,
696 .read_inode = ext4_read_inode,
697 .write_inode = ext4_write_inode,
698 .dirty_inode = ext4_dirty_inode,
699 .delete_inode = ext4_delete_inode,
700 .put_super = ext4_put_super,
701 .write_super = ext4_write_super,
702 .sync_fs = ext4_sync_fs,
703 .write_super_lockfs = ext4_write_super_lockfs,
704 .unlockfs = ext4_unlockfs,
705 .statfs = ext4_statfs,
706 .remount_fs = ext4_remount,
707 .clear_inode = ext4_clear_inode,
708 .show_options = ext4_show_options,
709#ifdef CONFIG_QUOTA
710 .quota_read = ext4_quota_read,
711 .quota_write = ext4_quota_write,
712#endif
713};
714
715static struct export_operations ext4_export_ops = {
716 .get_parent = ext4_get_parent,
717 .get_dentry = ext4_get_dentry,
718};
719
720enum {
721 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
722 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
723 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
724 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
725 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
726 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
727 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
728 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
729 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
730 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
731 Opt_grpquota, Opt_extents,
732};
733
734static match_table_t tokens = {
735 {Opt_bsd_df, "bsddf"},
736 {Opt_minix_df, "minixdf"},
737 {Opt_grpid, "grpid"},
738 {Opt_grpid, "bsdgroups"},
739 {Opt_nogrpid, "nogrpid"},
740 {Opt_nogrpid, "sysvgroups"},
741 {Opt_resgid, "resgid=%u"},
742 {Opt_resuid, "resuid=%u"},
743 {Opt_sb, "sb=%u"},
744 {Opt_err_cont, "errors=continue"},
745 {Opt_err_panic, "errors=panic"},
746 {Opt_err_ro, "errors=remount-ro"},
747 {Opt_nouid32, "nouid32"},
748 {Opt_nocheck, "nocheck"},
749 {Opt_nocheck, "check=none"},
750 {Opt_debug, "debug"},
751 {Opt_oldalloc, "oldalloc"},
752 {Opt_orlov, "orlov"},
753 {Opt_user_xattr, "user_xattr"},
754 {Opt_nouser_xattr, "nouser_xattr"},
755 {Opt_acl, "acl"},
756 {Opt_noacl, "noacl"},
757 {Opt_reservation, "reservation"},
758 {Opt_noreservation, "noreservation"},
759 {Opt_noload, "noload"},
760 {Opt_nobh, "nobh"},
761 {Opt_bh, "bh"},
762 {Opt_commit, "commit=%u"},
763 {Opt_journal_update, "journal=update"},
764 {Opt_journal_inum, "journal=%u"},
765 {Opt_journal_dev, "journal_dev=%u"},
766 {Opt_abort, "abort"},
767 {Opt_data_journal, "data=journal"},
768 {Opt_data_ordered, "data=ordered"},
769 {Opt_data_writeback, "data=writeback"},
770 {Opt_offusrjquota, "usrjquota="},
771 {Opt_usrjquota, "usrjquota=%s"},
772 {Opt_offgrpjquota, "grpjquota="},
773 {Opt_grpjquota, "grpjquota=%s"},
774 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
775 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
776 {Opt_grpquota, "grpquota"},
777 {Opt_noquota, "noquota"},
778 {Opt_quota, "quota"},
779 {Opt_usrquota, "usrquota"},
780 {Opt_barrier, "barrier=%u"},
781 {Opt_extents, "extents"},
782 {Opt_err, NULL},
783 {Opt_resize, "resize"},
784};
785
786static ext4_fsblk_t get_sb_block(void **data)
787{
788 ext4_fsblk_t sb_block;
789 char *options = (char *) *data;
790
791 if (!options || strncmp(options, "sb=", 3) != 0)
792 return 1; /* Default location */
793 options += 3;
794 /* todo: use simple_strtoll with >32bit ext4 */
795 sb_block = simple_strtoul(options, &options, 0);
796 if (*options && *options != ',') {
797 printk("EXT4-fs: Invalid sb specification: %s\n",
798 (char *) *data);
799 return 1;
800 }
801 if (*options == ',')
802 options++;
803 *data = (void *) options;
804 return sb_block;
805}
806
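/*
 * Example (a sketch with made-up numbers): for mount data
 * "sb=131072,noload" this returns 131072 and advances *data to point at
 * "noload"; any string that does not start with "sb=" leaves *data
 * untouched and falls back to the default primary superblock at block 1.
 */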
807static int parse_options (char *options, struct super_block *sb,
808 unsigned int *inum, unsigned long *journal_devnum,
809 ext4_fsblk_t *n_blocks_count, int is_remount)
810{
811 struct ext4_sb_info *sbi = EXT4_SB(sb);
812 char * p;
813 substring_t args[MAX_OPT_ARGS];
814 int data_opt = 0;
815 int option;
816#ifdef CONFIG_QUOTA
817 int qtype;
818 char *qname;
819#endif
820
821 if (!options)
822 return 1;
823
824 while ((p = strsep (&options, ",")) != NULL) {
825 int token;
826 if (!*p)
827 continue;
828
829 token = match_token(p, tokens, args);
830 switch (token) {
831 case Opt_bsd_df:
832 clear_opt (sbi->s_mount_opt, MINIX_DF);
833 break;
834 case Opt_minix_df:
835 set_opt (sbi->s_mount_opt, MINIX_DF);
836 break;
837 case Opt_grpid:
838 set_opt (sbi->s_mount_opt, GRPID);
839 break;
840 case Opt_nogrpid:
841 clear_opt (sbi->s_mount_opt, GRPID);
842 break;
843 case Opt_resuid:
844 if (match_int(&args[0], &option))
845 return 0;
846 sbi->s_resuid = option;
847 break;
848 case Opt_resgid:
849 if (match_int(&args[0], &option))
850 return 0;
851 sbi->s_resgid = option;
852 break;
853 case Opt_sb:
854 /* handled by get_sb_block() instead of here */
855 /* *sb_block = match_int(&args[0]); */
856 break;
857 case Opt_err_panic:
858 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
859 clear_opt (sbi->s_mount_opt, ERRORS_RO);
860 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
861 break;
862 case Opt_err_ro:
863 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
864 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
865 set_opt (sbi->s_mount_opt, ERRORS_RO);
866 break;
867 case Opt_err_cont:
868 clear_opt (sbi->s_mount_opt, ERRORS_RO);
869 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
870 set_opt (sbi->s_mount_opt, ERRORS_CONT);
871 break;
872 case Opt_nouid32:
873 set_opt (sbi->s_mount_opt, NO_UID32);
874 break;
875 case Opt_nocheck:
876 clear_opt (sbi->s_mount_opt, CHECK);
877 break;
878 case Opt_debug:
879 set_opt (sbi->s_mount_opt, DEBUG);
880 break;
881 case Opt_oldalloc:
882 set_opt (sbi->s_mount_opt, OLDALLOC);
883 break;
884 case Opt_orlov:
885 clear_opt (sbi->s_mount_opt, OLDALLOC);
886 break;
887#ifdef CONFIG_EXT4DEV_FS_XATTR
888 case Opt_user_xattr:
889 set_opt (sbi->s_mount_opt, XATTR_USER);
890 break;
891 case Opt_nouser_xattr:
892 clear_opt (sbi->s_mount_opt, XATTR_USER);
893 break;
894#else
895 case Opt_user_xattr:
896 case Opt_nouser_xattr:
897 printk("EXT4 (no)user_xattr options not supported\n");
898 break;
899#endif
900#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
901 case Opt_acl:
902 set_opt(sbi->s_mount_opt, POSIX_ACL);
903 break;
904 case Opt_noacl:
905 clear_opt(sbi->s_mount_opt, POSIX_ACL);
906 break;
907#else
908 case Opt_acl:
909 case Opt_noacl:
910 printk("EXT4 (no)acl options not supported\n");
911 break;
912#endif
913 case Opt_reservation:
914 set_opt(sbi->s_mount_opt, RESERVATION);
915 break;
916 case Opt_noreservation:
917 clear_opt(sbi->s_mount_opt, RESERVATION);
918 break;
919 case Opt_journal_update:
920 /* @@@ FIXME */
921 /* Eventually we will want to be able to create
922 a journal file here. For now, only allow the
923 user to specify an existing inode to be the
924 journal file. */
925 if (is_remount) {
926 printk(KERN_ERR "EXT4-fs: cannot specify "
927 "journal on remount\n");
928 return 0;
929 }
930 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
931 break;
932 case Opt_journal_inum:
933 if (is_remount) {
934 printk(KERN_ERR "EXT4-fs: cannot specify "
935 "journal on remount\n");
936 return 0;
937 }
938 if (match_int(&args[0], &option))
939 return 0;
940 *inum = option;
941 break;
942 case Opt_journal_dev:
943 if (is_remount) {
944 printk(KERN_ERR "EXT4-fs: cannot specify "
945 "journal on remount\n");
946 return 0;
947 }
948 if (match_int(&args[0], &option))
949 return 0;
950 *journal_devnum = option;
951 break;
952 case Opt_noload:
953 set_opt (sbi->s_mount_opt, NOLOAD);
954 break;
955 case Opt_commit:
956 if (match_int(&args[0], &option))
957 return 0;
958 if (option < 0)
959 return 0;
960 if (option == 0)
961 option = JBD_DEFAULT_MAX_COMMIT_AGE;
962 sbi->s_commit_interval = HZ * option;
963 break;
964 case Opt_data_journal:
965 data_opt = EXT4_MOUNT_JOURNAL_DATA;
966 goto datacheck;
967 case Opt_data_ordered:
968 data_opt = EXT4_MOUNT_ORDERED_DATA;
969 goto datacheck;
970 case Opt_data_writeback:
971 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
972 datacheck:
973 if (is_remount) {
974 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
975 != data_opt) {
976 printk(KERN_ERR
977 "EXT4-fs: cannot change data "
978 "mode on remount\n");
979 return 0;
980 }
981 } else {
982 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
983 sbi->s_mount_opt |= data_opt;
984 }
985 break;
986#ifdef CONFIG_QUOTA
987 case Opt_usrjquota:
988 qtype = USRQUOTA;
989 goto set_qf_name;
990 case Opt_grpjquota:
991 qtype = GRPQUOTA;
992set_qf_name:
993 if (sb_any_quota_enabled(sb)) {
994 printk(KERN_ERR
995 "EXT4-fs: Cannot change journalled "
996 "quota options when quota turned on.\n");
997 return 0;
998 }
999 qname = match_strdup(&args[0]);
1000 if (!qname) {
1001 printk(KERN_ERR
1002 "EXT4-fs: not enough memory for "
1003 "storing quotafile name.\n");
1004 return 0;
1005 }
1006 if (sbi->s_qf_names[qtype] &&
1007 strcmp(sbi->s_qf_names[qtype], qname)) {
1008 printk(KERN_ERR
1009 "EXT4-fs: %s quota file already "
1010 "specified.\n", QTYPE2NAME(qtype));
1011 kfree(qname);
1012 return 0;
1013 }
1014 sbi->s_qf_names[qtype] = qname;
1015 if (strchr(sbi->s_qf_names[qtype], '/')) {
1016 printk(KERN_ERR
1017 "EXT4-fs: quotafile must be on "
1018 "filesystem root.\n");
1019 kfree(sbi->s_qf_names[qtype]);
1020 sbi->s_qf_names[qtype] = NULL;
1021 return 0;
1022 }
1023 set_opt(sbi->s_mount_opt, QUOTA);
1024 break;
1025 case Opt_offusrjquota:
1026 qtype = USRQUOTA;
1027 goto clear_qf_name;
1028 case Opt_offgrpjquota:
1029 qtype = GRPQUOTA;
1030clear_qf_name:
1031 if (sb_any_quota_enabled(sb)) {
1032 printk(KERN_ERR "EXT4-fs: Cannot change "
1033 "journalled quota options when "
1034 "quota turned on.\n");
1035 return 0;
1036 }
1037 /*
1038 * The space will be released later when all options
1039 * are confirmed to be correct
1040 */
1041 sbi->s_qf_names[qtype] = NULL;
1042 break;
1043 case Opt_jqfmt_vfsold:
1044 sbi->s_jquota_fmt = QFMT_VFS_OLD;
1045 break;
1046 case Opt_jqfmt_vfsv0:
1047 sbi->s_jquota_fmt = QFMT_VFS_V0;
1048 break;
1049 case Opt_quota:
1050 case Opt_usrquota:
1051 set_opt(sbi->s_mount_opt, QUOTA);
1052 set_opt(sbi->s_mount_opt, USRQUOTA);
1053 break;
1054 case Opt_grpquota:
1055 set_opt(sbi->s_mount_opt, QUOTA);
1056 set_opt(sbi->s_mount_opt, GRPQUOTA);
1057 break;
1058 case Opt_noquota:
1059 if (sb_any_quota_enabled(sb)) {
1060 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1061 "options when quota turned on.\n");
1062 return 0;
1063 }
1064 clear_opt(sbi->s_mount_opt, QUOTA);
1065 clear_opt(sbi->s_mount_opt, USRQUOTA);
1066 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1067 break;
1068#else
1069 case Opt_quota:
1070 case Opt_usrquota:
1071 case Opt_grpquota:
1072 case Opt_usrjquota:
1073 case Opt_grpjquota:
1074 case Opt_offusrjquota:
1075 case Opt_offgrpjquota:
1076 case Opt_jqfmt_vfsold:
1077 case Opt_jqfmt_vfsv0:
1078 printk(KERN_ERR
1079 "EXT4-fs: journalled quota options not "
1080 "supported.\n");
1081 break;
1082 case Opt_noquota:
1083 break;
1084#endif
1085 case Opt_abort:
1086 set_opt(sbi->s_mount_opt, ABORT);
1087 break;
1088 case Opt_barrier:
1089 if (match_int(&args[0], &option))
1090 return 0;
1091 if (option)
1092 set_opt(sbi->s_mount_opt, BARRIER);
1093 else
1094 clear_opt(sbi->s_mount_opt, BARRIER);
1095 break;
1096 case Opt_ignore:
1097 break;
1098 case Opt_resize:
1099 if (!is_remount) {
1100 printk("EXT4-fs: resize option only available "
1101 "for remount\n");
1102 return 0;
1103 }
1104 if (match_int(&args[0], &option) != 0)
1105 return 0;
1106 *n_blocks_count = option;
1107 break;
1108 case Opt_nobh:
1109 set_opt(sbi->s_mount_opt, NOBH);
1110 break;
1111 case Opt_bh:
1112 clear_opt(sbi->s_mount_opt, NOBH);
1113 break;
1114 case Opt_extents:
1115 set_opt (sbi->s_mount_opt, EXTENTS);
1116 break;
1117 default:
1118 printk (KERN_ERR
1119 "EXT4-fs: Unrecognized mount option \"%s\" "
1120 "or missing value\n", p);
1121 return 0;
1122 }
1123 }
1124#ifdef CONFIG_QUOTA
1125 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1126 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
1127 sbi->s_qf_names[USRQUOTA])
1128 clear_opt(sbi->s_mount_opt, USRQUOTA);
1129
1130 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
1131 sbi->s_qf_names[GRPQUOTA])
1132 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1133
1134 if ((sbi->s_qf_names[USRQUOTA] &&
1135 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1136 (sbi->s_qf_names[GRPQUOTA] &&
1137 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1138 printk(KERN_ERR "EXT4-fs: old and new quota "
1139 "format mixing.\n");
1140 return 0;
1141 }
1142
1143 if (!sbi->s_jquota_fmt) {
1144 printk(KERN_ERR "EXT4-fs: journalled quota format "
1145 "not specified.\n");
1146 return 0;
1147 }
1148 } else {
1149 if (sbi->s_jquota_fmt) {
1150 printk(KERN_ERR "EXT4-fs: journalled quota format "
1151 "specified with no journalling "
1152 "enabled.\n");
1153 return 0;
1154 }
1155 }
1156#endif
1157 return 1;
1158}
1159
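/*
 * Example (a sketch): mount options "data=journal,errors=panic,commit=15"
 * make parse_options() set JOURNAL_DATA and ERRORS_PANIC in s_mount_opt
 * and a 15 second s_commit_interval, while "resize=262144" is only
 * accepted on a remount and is then returned through *n_blocks_count.
 */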
1160static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1161 int read_only)
1162{
1163 struct ext4_sb_info *sbi = EXT4_SB(sb);
1164 int res = 0;
1165
1166 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1167 printk (KERN_ERR "EXT4-fs warning: revision level too high, "
1168 "forcing read-only mode\n");
1169 res = MS_RDONLY;
1170 }
1171 if (read_only)
1172 return res;
1173 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1174 printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
1175 "running e2fsck is recommended\n");
1176 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1177 printk (KERN_WARNING
1178 "EXT4-fs warning: mounting fs with errors, "
1179 "running e2fsck is recommended\n");
1180 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1181 le16_to_cpu(es->s_mnt_count) >=
1182 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1183 printk (KERN_WARNING
1184 "EXT4-fs warning: maximal mount count reached, "
1185 "running e2fsck is recommended\n");
1186 else if (le32_to_cpu(es->s_checkinterval) &&
1187 (le32_to_cpu(es->s_lastcheck) +
1188 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1189 printk (KERN_WARNING
1190 "EXT4-fs warning: checktime reached, "
1191 "running e2fsck is recommended\n");
1192#if 0
1193 /* @@@ We _will_ want to clear the valid bit if we find
1194 * inconsistencies, to force a fsck at reboot. But for
1195 * a plain journaled filesystem we can keep it set as
1196 * valid forever! :)
1197 */
1198 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
1199#endif
1200 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1201 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1202 es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
1203 es->s_mtime = cpu_to_le32(get_seconds());
1204 ext4_update_dynamic_rev(sb);
1205 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1206
1207 ext4_commit_super(sb, es, 1);
1208 if (test_opt(sb, DEBUG))
1209 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
1210 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1211 sb->s_blocksize,
1212 sbi->s_groups_count,
1213 EXT4_BLOCKS_PER_GROUP(sb),
1214 EXT4_INODES_PER_GROUP(sb),
1215 sbi->s_mount_opt);
1216
1217 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
1218 if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
1219 char b[BDEVNAME_SIZE];
1220
1221 printk("external journal on %s\n",
1222 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1223 } else {
1224 printk("internal journal\n");
1225 }
1226 return res;
1227}
1228
1229/* Called at mount-time, super-block is locked */
1230static int ext4_check_descriptors (struct super_block * sb)
1231{
1232 struct ext4_sb_info *sbi = EXT4_SB(sb);
1233 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1234 ext4_fsblk_t last_block;
1235 ext4_fsblk_t block_bitmap;
1236 ext4_fsblk_t inode_bitmap;
1237 ext4_fsblk_t inode_table;
1238 struct ext4_group_desc * gdp = NULL;
1239 int desc_block = 0;
1240 int i;
1241
1242 ext4_debug ("Checking group descriptors");
1243
1244 for (i = 0; i < sbi->s_groups_count; i++)
1245 {
1246 if (i == sbi->s_groups_count - 1)
1247 last_block = ext4_blocks_count(sbi->s_es) - 1;
1248 else
1249 last_block = first_block +
1250 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1251
1252 if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
1253 gdp = (struct ext4_group_desc *)
1254 sbi->s_group_desc[desc_block++]->b_data;
1255 block_bitmap = ext4_block_bitmap(sb, gdp);
1256 if (block_bitmap < first_block || block_bitmap > last_block)
1257 {
1258 ext4_error (sb, "ext4_check_descriptors",
1259 "Block bitmap for group %d"
1260 " not in group (block %llu)!",
1261 i, block_bitmap);
1262 return 0;
1263 }
1264 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1265 if (inode_bitmap < first_block || inode_bitmap > last_block)
1266 {
1267 ext4_error (sb, "ext4_check_descriptors",
1268 "Inode bitmap for group %d"
1269 " not in group (block %llu)!",
1270 i, inode_bitmap);
1271 return 0;
1272 }
1273 inode_table = ext4_inode_table(sb, gdp);
1274 if (inode_table < first_block ||
1275 inode_table + sbi->s_itb_per_group > last_block)
1276 {
1277 ext4_error (sb, "ext4_check_descriptors",
1278 "Inode table for group %d"
1279 " not in group (block %llu)!",
1280 i, inode_table);
1281 return 0;
1282 }
1283 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1284 gdp = (struct ext4_group_desc *)
1285 ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
1286 }
1287
1288 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1289 sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
1290 return 1;
1291}
1292
1293
1294/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1295 * the superblock) which were deleted from all directories, but held open by
1296 * a process at the time of a crash. We walk the list and try to delete these
1297 * inodes at recovery time (only with a read-write filesystem).
1298 *
1299 * In order to keep the orphan inode chain consistent during traversal (in
1300 * case of crash during recovery), we link each inode into the superblock
1301 * orphan list_head and handle it the same way as an inode deletion during
1302 * normal operation (which journals the operations for us).
1303 *
1304 * We only do an iget() and an iput() on each inode, which is very safe if we
1305 * accidentally point at an in-use or already deleted inode. The worst that
1306 * can happen in this case is that we get a "bit already cleared" message from
1307 * ext4_free_inode(). The only reason we would point at a wrong inode is if
1308 * e2fsck was run on this filesystem, and it must have already done the orphan
1309 * inode cleanup for us, so we can safely abort without any further action.
1310 */
1311static void ext4_orphan_cleanup (struct super_block * sb,
1312 struct ext4_super_block * es)
1313{
1314 unsigned int s_flags = sb->s_flags;
1315 int nr_orphans = 0, nr_truncates = 0;
1316#ifdef CONFIG_QUOTA
1317 int i;
1318#endif
1319 if (!es->s_last_orphan) {
1320 jbd_debug(4, "no orphan inodes to clean up\n");
1321 return;
1322 }
1323
1324 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
1325 if (es->s_last_orphan)
1326 jbd_debug(1, "Errors on filesystem, "
1327 "clearing orphan list.\n");
1328 es->s_last_orphan = 0;
1329 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1330 return;
1331 }
1332
1333 if (s_flags & MS_RDONLY) {
1334 printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
1335 sb->s_id);
1336 sb->s_flags &= ~MS_RDONLY;
1337 }
1338#ifdef CONFIG_QUOTA
1339 /* Needed for iput() to work correctly and not trash data */
1340 sb->s_flags |= MS_ACTIVE;
1341 /* Turn on quotas so that they are updated correctly */
1342 for (i = 0; i < MAXQUOTAS; i++) {
1343 if (EXT4_SB(sb)->s_qf_names[i]) {
1344 int ret = ext4_quota_on_mount(sb, i);
1345 if (ret < 0)
1346 printk(KERN_ERR
1347 "EXT4-fs: Cannot turn on journalled "
1348 "quota: error %d\n", ret);
1349 }
1350 }
1351#endif
1352
1353 while (es->s_last_orphan) {
1354 struct inode *inode;
1355
1356 if (!(inode =
1357 ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1358 es->s_last_orphan = 0;
1359 break;
1360 }
1361
1362 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1363 DQUOT_INIT(inode);
1364 if (inode->i_nlink) {
1365 printk(KERN_DEBUG
1366 "%s: truncating inode %lu to %Ld bytes\n",
1367 __FUNCTION__, inode->i_ino, inode->i_size);
1368 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1369 inode->i_ino, inode->i_size);
1370 ext4_truncate(inode);
1371 nr_truncates++;
1372 } else {
1373 printk(KERN_DEBUG
1374 "%s: deleting unreferenced inode %lu\n",
1375 __FUNCTION__, inode->i_ino);
1376 jbd_debug(2, "deleting unreferenced inode %lu\n",
1377 inode->i_ino);
1378 nr_orphans++;
1379 }
1380 iput(inode); /* The delete magic happens here! */
1381 }
1382
1383#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1384
1385 if (nr_orphans)
1386 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
1387 sb->s_id, PLURAL(nr_orphans));
1388 if (nr_truncates)
1389 printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
1390 sb->s_id, PLURAL(nr_truncates));
1391#ifdef CONFIG_QUOTA
1392 /* Turn quotas off */
1393 for (i = 0; i < MAXQUOTAS; i++) {
1394 if (sb_dqopt(sb)->files[i])
1395 vfs_quota_off(sb, i);
1396 }
1397#endif
1398 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1399}
1400
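/* ffz(~(n)) is the index of the lowest set bit of n; for the power-of-two
 * counts it is applied to below (addresses and descriptors per block), that
 * index equals log2(n). */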
1401#define log2(n) ffz(~(n))
1402
1403/*
1404 * Maximal file size. There is a direct-, indirect-, double- and triple-indirect
1405 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1406 * We need to be 1 filesystem block less than the 2^32 sector limit.
1407 */
1408static loff_t ext4_max_size(int bits)
1409{
1410 loff_t res = EXT4_NDIR_BLOCKS;
1411 /* This constant is calculated to be the largest file size for a
1412 * dense, 4k-blocksize file such that the total number of
1413 * sectors in the file, including data and all indirect blocks,
1414 * does not exceed 2^32. */
1415 const loff_t upper_limit = 0x1ff7fffd000LL;
1416
1417 res += 1LL << (bits-2);
1418 res += 1LL << (2*(bits-2));
1419 res += 1LL << (3*(bits-2));
1420 res <<= bits;
1421 if (res > upper_limit)
1422 res = upper_limit;
1423 return res;
1424}
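/*
 * Worked example of the arithmetic above: with 4 KiB blocks, bits = 12, so
 *
 *	res = 12 + 2^10 + 2^20 + 2^30 blocks	(direct + indirect terms)
 *	res << 12 ~= 2^42 bytes (~4.4 TB)
 *
 * which exceeds upper_limit (0x1ff7fffd000 bytes ~= 2.2 TB, the point where
 * data plus indirect blocks reach 2^32 512-byte sectors), so ext4_max_size(12)
 * returns upper_limit and a 4 KiB filesystem's s_maxbytes is capped there.
 */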
1425
1426static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1427 ext4_fsblk_t logical_sb_block, int nr)
1428{
1429 struct ext4_sb_info *sbi = EXT4_SB(sb);
1430 unsigned long bg, first_meta_bg;
1431 int has_super = 0;
1432
1433 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1434
1435 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
1436 nr < first_meta_bg)
1437 return logical_sb_block + nr + 1;
1438 bg = sbi->s_desc_per_block * nr;
1439 if (ext4_bg_has_super(sb, bg))
1440 has_super = 1;
1441 return (has_super + ext4_group_first_block_no(sb, bg));
1442}
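/*
 * Layout note: without META_BG, or for descriptor blocks below
 * s_first_meta_bg, group descriptor block nr simply follows the superblock
 * (nr = 0 is read from logical_sb_block + 1). With META_BG, block nr
 * describes the groups starting at bg = s_desc_per_block * nr and is stored
 * at the start of that group, after the backup superblock when
 * ext4_bg_has_super() reports one.
 */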
1443
1444
1445static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1446{
1447 struct buffer_head * bh;
1448 struct ext4_super_block *es = NULL;
1449 struct ext4_sb_info *sbi;
1450 ext4_fsblk_t block;
1451 ext4_fsblk_t sb_block = get_sb_block(&data);
1452 ext4_fsblk_t logical_sb_block;
1453 unsigned long offset = 0;
1454 unsigned int journal_inum = 0;
1455 unsigned long journal_devnum = 0;
1456 unsigned long def_mount_opts;
1457 struct inode *root;
1458 int blocksize;
1459 int hblock;
1460 int db_count;
1461 int i;
1462 int needs_recovery;
1463 __le32 features;
1464 __u64 blocks_count;
1465
1466 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1467 if (!sbi)
1468 return -ENOMEM;
1469 sb->s_fs_info = sbi;
1470 sbi->s_mount_opt = 0;
1471 sbi->s_resuid = EXT4_DEF_RESUID;
1472 sbi->s_resgid = EXT4_DEF_RESGID;
1473
1474 unlock_kernel();
1475
1476 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1477 if (!blocksize) {
1478 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
1479 goto out_fail;
1480 }
1481
1482 /*
1483 * The ext4 superblock will not be buffer aligned for other than 1kB
1484 * block sizes. We need to calculate the offset from buffer start.
1485 */
1486 if (blocksize != EXT4_MIN_BLOCK_SIZE) {
1487 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1488 offset = do_div(logical_sb_block, blocksize);
1489 } else {
1490 logical_sb_block = sb_block;
1491 }
1492
1493 if (!(bh = sb_bread(sb, logical_sb_block))) {
1494 printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
1495 goto out_fail;
1496 }
1497 /*
1498 * Note: s_es must be initialized as soon as possible because
1499 * some ext4 macros depend on its value
1500 */
1501 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
1502 sbi->s_es = es;
1503 sb->s_magic = le16_to_cpu(es->s_magic);
1504 if (sb->s_magic != EXT4_SUPER_MAGIC)
1505 goto cantfind_ext4;
1506
1507 /* Set defaults before we parse the mount options */
1508 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1509 if (def_mount_opts & EXT4_DEFM_DEBUG)
1510 set_opt(sbi->s_mount_opt, DEBUG);
1511 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
1512 set_opt(sbi->s_mount_opt, GRPID);
1513 if (def_mount_opts & EXT4_DEFM_UID16)
1514 set_opt(sbi->s_mount_opt, NO_UID32);
1515 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1516 set_opt(sbi->s_mount_opt, XATTR_USER);
1517 if (def_mount_opts & EXT4_DEFM_ACL)
1518 set_opt(sbi->s_mount_opt, POSIX_ACL);
1519 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
1520 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
1521 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
1522 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
1523 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
1524 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
1525
1526 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1527 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1528 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
1529 set_opt(sbi->s_mount_opt, ERRORS_RO);
1530 else
1531 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1532
1533 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1534 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1535
1536 set_opt(sbi->s_mount_opt, RESERVATION);
1537
1538 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1539 NULL, 0))
1540 goto failed_mount;
1541
1542 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1543 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1544
1545 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
1546 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
1547 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1548 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1549 printk(KERN_WARNING
1550 "EXT4-fs warning: feature flags set on rev 0 fs, "
1551 "running e2fsck is recommended\n");
1552 /*
1553 * Check feature flags regardless of the revision level, since we
1554 * previously didn't change the revision level when setting the flags,
1555 * so there is a chance incompat flags are set on a rev 0 filesystem.
1556 */
1557 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
1558 if (features) {
1559 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
1560 "unsupported optional features (%x).\n",
1561 sb->s_id, le32_to_cpu(features));
1562 goto failed_mount;
1563 }
1564 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
1565 if (!(sb->s_flags & MS_RDONLY) && features) {
1566 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
1567 "unsupported optional features (%x).\n",
1568 sb->s_id, le32_to_cpu(features));
1569 goto failed_mount;
1570 }
1571 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1572
1573 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
1574 blocksize > EXT4_MAX_BLOCK_SIZE) {
1575 printk(KERN_ERR
1576 "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
1577 blocksize, sb->s_id);
1578 goto failed_mount;
1579 }
1580
1581 hblock = bdev_hardsect_size(sb->s_bdev);
1582 if (sb->s_blocksize != blocksize) {
1583 /*
1584 * Make sure the blocksize for the filesystem is at least as large
1585 * as the hardware sector size of the device.
1586 */
1587 if (blocksize < hblock) {
1588 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1589 "device blocksize %d.\n", blocksize, hblock);
1590 goto failed_mount;
1591 }
1592
1593 brelse (bh);
1594 sb_set_blocksize(sb, blocksize);
1595 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1596 offset = do_div(logical_sb_block, blocksize);
1597 bh = sb_bread(sb, logical_sb_block);
1598 if (!bh) {
1599 printk(KERN_ERR
1600 "EXT4-fs: Can't read superblock on 2nd try.\n");
1601 goto failed_mount;
1602 }
1603 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
1604 sbi->s_es = es;
1605 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
1606 printk (KERN_ERR
1607 "EXT4-fs: Magic mismatch, very weird !\n");
1608 goto failed_mount;
1609 }
1610 }
1611
1612 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1613
1614 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
1615 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
1616 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
1617 } else {
1618 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1619 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1620 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
1621 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1622 (sbi->s_inode_size > blocksize)) {
1623 printk (KERN_ERR
1624 "EXT4-fs: unsupported inode size: %d\n",
1625 sbi->s_inode_size);
1626 goto failed_mount;
1627 }
1628 }
1629 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
1630 le32_to_cpu(es->s_log_frag_size);
1631 if (blocksize != sbi->s_frag_size) {
1632 printk(KERN_ERR
1633 "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
1634 sbi->s_frag_size, blocksize);
1635 goto failed_mount;
1636 }
1637 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
1638 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
1639 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
1640 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
1641 sbi->s_desc_size & (sbi->s_desc_size - 1)) {
1642 printk(KERN_ERR
1643 "EXT4-fs: unsupported descriptor size %lu\n",
1644 sbi->s_desc_size);
1645 goto failed_mount;
1646 }
1647 } else
1648 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
1649 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1650 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1651 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1652 if (EXT4_INODE_SIZE(sb) == 0)
1653 goto cantfind_ext4;
1654 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
1655 if (sbi->s_inodes_per_block == 0)
1656 goto cantfind_ext4;
1657 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1658 sbi->s_inodes_per_block;
1659 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
1660 sbi->s_sbh = bh;
1661 sbi->s_mount_state = le16_to_cpu(es->s_state);
1662 sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
1663 sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
1664 for (i=0; i < 4; i++)
1665 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1666 sbi->s_def_hash_version = es->s_def_hash_version;
1667
1668 if (sbi->s_blocks_per_group > blocksize * 8) {
1669 printk (KERN_ERR
1670 "EXT4-fs: #blocks per group too big: %lu\n",
1671 sbi->s_blocks_per_group);
1672 goto failed_mount;
1673 }
1674 if (sbi->s_frags_per_group > blocksize * 8) {
1675 printk (KERN_ERR
1676 "EXT4-fs: #fragments per group too big: %lu\n",
1677 sbi->s_frags_per_group);
1678 goto failed_mount;
1679 }
1680 if (sbi->s_inodes_per_group > blocksize * 8) {
1681 printk (KERN_ERR
1682 "EXT4-fs: #inodes per group too big: %lu\n",
1683 sbi->s_inodes_per_group);
1684 goto failed_mount;
1685 }
1686
1687 if (ext4_blocks_count(es) >
1688 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1689 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
1690 " too large to mount safely\n", sb->s_id);
1691 if (sizeof(sector_t) < 8)
1692 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
1693 "enabled\n");
1694 goto failed_mount;
1695 }
1696
1697 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1698 goto cantfind_ext4;
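	/* The group count is a ceiling division: every partial group of
	 * blocks after s_first_data_block still needs its own descriptor. */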
1699 blocks_count = (ext4_blocks_count(es) -
1700 le32_to_cpu(es->s_first_data_block) +
1701 EXT4_BLOCKS_PER_GROUP(sb) - 1);
1702 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
1703 sbi->s_groups_count = blocks_count;
1704 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1705 EXT4_DESC_PER_BLOCK(sb);
1706 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1707 GFP_KERNEL);
1708 if (sbi->s_group_desc == NULL) {
1709 printk (KERN_ERR "EXT4-fs: not enough memory\n");
1710 goto failed_mount;
1711 }
1712
1713 bgl_lock_init(&sbi->s_blockgroup_lock);
1714
1715 for (i = 0; i < db_count; i++) {
1716 block = descriptor_loc(sb, logical_sb_block, i);
1717 sbi->s_group_desc[i] = sb_bread(sb, block);
1718 if (!sbi->s_group_desc[i]) {
1719 printk (KERN_ERR "EXT4-fs: "
1720 "can't read group descriptor %d\n", i);
1721 db_count = i;
1722 goto failed_mount2;
1723 }
1724 }
1725 if (!ext4_check_descriptors (sb)) {
1726 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
1727 goto failed_mount2;
1728 }
1729 sbi->s_gdb_count = db_count;
1730 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1731 spin_lock_init(&sbi->s_next_gen_lock);
1732
1733 percpu_counter_init(&sbi->s_freeblocks_counter,
1734 ext4_count_free_blocks(sb));
1735 percpu_counter_init(&sbi->s_freeinodes_counter,
1736 ext4_count_free_inodes(sb));
1737 percpu_counter_init(&sbi->s_dirs_counter,
1738 ext4_count_dirs(sb));
1739
1740 /* per filesystem reservation list head & lock */
1741 spin_lock_init(&sbi->s_rsv_window_lock);
1742 sbi->s_rsv_window_root = RB_ROOT;
1743 /* Add a single, static dummy reservation to the start of the
1744 * reservation window list --- it gives us a placeholder for
1745 * append-at-start-of-list which makes the allocation logic
1746 * _much_ simpler. */
1747 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1748 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1749 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1750 sbi->s_rsv_window_head.rsv_goal_size = 0;
1751 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
1752
1753 /*
1754 * set up enough so that it can read an inode
1755 */
1756 sb->s_op = &ext4_sops;
1757 sb->s_export_op = &ext4_export_ops;
1758 sb->s_xattr = ext4_xattr_handlers;
1759#ifdef CONFIG_QUOTA
1760 sb->s_qcop = &ext4_qctl_operations;
1761 sb->dq_op = &ext4_quota_operations;
1762#endif
1763 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1764
1765 sb->s_root = NULL;
1766
1767 needs_recovery = (es->s_last_orphan != 0 ||
1768 EXT4_HAS_INCOMPAT_FEATURE(sb,
1769 EXT4_FEATURE_INCOMPAT_RECOVER));
1770
1771 /*
1772 * The first inode we look at is the journal inode. Don't try
1773 * root first: it may be modified in the journal!
1774 */
1775 if (!test_opt(sb, NOLOAD) &&
1776 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
1777 if (ext4_load_journal(sb, es, journal_devnum))
1778 goto failed_mount3;
1779 } else if (journal_inum) {
1780 if (ext4_create_journal(sb, es, journal_inum))
1781 goto failed_mount3;
1782 } else {
1783 if (!silent)
1784 printk (KERN_ERR
1785 "ext4: No journal on filesystem on %s\n",
1786 sb->s_id);
1787 goto failed_mount3;
1788 }
1789
1790 /* We have now updated the journal if required, so we can
1791 * validate the data journaling mode. */
1792 switch (test_opt(sb, DATA_FLAGS)) {
1793 case 0:
1794 /* No mode set, assume a default based on the journal
1795 * capabilities: ORDERED_DATA if the journal can
1796 * cope, else JOURNAL_DATA
1797 */
1798 if (jbd2_journal_check_available_features
1799 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
1800 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1801 else
1802 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1803 break;
1804
1805 case EXT4_MOUNT_ORDERED_DATA:
1806 case EXT4_MOUNT_WRITEBACK_DATA:
1807 if (!jbd2_journal_check_available_features
1808 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
1809 printk(KERN_ERR "EXT4-fs: Journal does not support "
1810 "requested data journaling mode\n");
1811 goto failed_mount4;
1812 }
1813 default:
1814 break;
1815 }
1816
1817 if (test_opt(sb, NOBH)) {
1818 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
1819 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
1820 "its supported only with writeback mode\n");
1821 clear_opt(sbi->s_mount_opt, NOBH);
1822 }
1823 }
1824 /*
1825 * The jbd2_journal_load will have done any necessary log recovery,
1826 * so we can safely mount the rest of the filesystem now.
1827 */
1828
1829 root = iget(sb, EXT4_ROOT_INO);
1830 sb->s_root = d_alloc_root(root);
1831 if (!sb->s_root) {
1832 printk(KERN_ERR "EXT4-fs: get root inode failed\n");
1833 iput(root);
1834 goto failed_mount4;
1835 }
1836 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1837 dput(sb->s_root);
1838 sb->s_root = NULL;
1839 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
1840 goto failed_mount4;
1841 }
1842
1843 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1844 /*
1845 * akpm: core read_super() calls in here with the superblock locked.
1846 * That deadlocks, because orphan cleanup needs to lock the superblock
1847 * in numerous places. Here we just pop the lock - it's relatively
1848 * harmless, because we are now ready to accept write_super() requests,
1849 * and aviro says that's the only reason for hanging onto the
1850 * superblock lock.
1851 */
1852 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
1853 ext4_orphan_cleanup(sb, es);
1854 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
1855 if (needs_recovery)
1856 printk (KERN_INFO "EXT4-fs: recovery complete.\n");
1857 ext4_mark_recovery_complete(sb, es);
1858 printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
1859 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
1860 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
1861 "writeback");
1862
1863 ext4_ext_init(sb);
1864
1865 lock_kernel();
1866 return 0;
1867
1868cantfind_ext4:
1869 if (!silent)
1870 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
1871 sb->s_id);
1872 goto failed_mount;
1873
1874failed_mount4:
1875 jbd2_journal_destroy(sbi->s_journal);
1876failed_mount3:
1877 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1878 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1879 percpu_counter_destroy(&sbi->s_dirs_counter);
1880failed_mount2:
1881 for (i = 0; i < db_count; i++)
1882 brelse(sbi->s_group_desc[i]);
1883 kfree(sbi->s_group_desc);
1884failed_mount:
1885#ifdef CONFIG_QUOTA
1886 for (i = 0; i < MAXQUOTAS; i++)
1887 kfree(sbi->s_qf_names[i]);
1888#endif
1889 ext4_blkdev_remove(sbi);
1890 brelse(bh);
1891out_fail:
1892 sb->s_fs_info = NULL;
1893 kfree(sbi);
1894 lock_kernel();
1895 return -EINVAL;
1896}
1897
1898/*
1899 * Setup any per-fs journal parameters now. We'll do this both on
1900 * initial mount, once the journal has been initialised but before we've
1901 * done any recovery; and again on any subsequent remount.
1902 */
1903static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
1904{
1905 struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907 if (sbi->s_commit_interval)
1908 journal->j_commit_interval = sbi->s_commit_interval;
1909 /* We could also set up an ext4-specific default for the commit
1910 * interval here, but for now we'll just fall back to the jbd
1911 * default. */
1912
1913 spin_lock(&journal->j_state_lock);
1914 if (test_opt(sb, BARRIER))
1915 journal->j_flags |= JBD2_BARRIER;
1916 else
1917 journal->j_flags &= ~JBD2_BARRIER;
1918 spin_unlock(&journal->j_state_lock);
1919}
1920
1921static journal_t *ext4_get_journal(struct super_block *sb,
1922 unsigned int journal_inum)
1923{
1924 struct inode *journal_inode;
1925 journal_t *journal;
1926
1927 /* First, test for the existence of a valid inode on disk. Bad
1928 * things happen if we iget() an unused inode, as the subsequent
1929 * iput() will try to delete it. */
1930
1931 journal_inode = iget(sb, journal_inum);
1932 if (!journal_inode) {
1933 printk(KERN_ERR "EXT4-fs: no journal found.\n");
1934 return NULL;
1935 }
1936 if (!journal_inode->i_nlink) {
1937 make_bad_inode(journal_inode);
1938 iput(journal_inode);
1939 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
1940 return NULL;
1941 }
1942
1943 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1944 journal_inode, journal_inode->i_size);
1945 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1946 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
1947 iput(journal_inode);
1948 return NULL;
1949 }
1950
1951 journal = jbd2_journal_init_inode(journal_inode);
1952 if (!journal) {
1953 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
1954 iput(journal_inode);
1955 return NULL;
1956 }
1957 journal->j_private = sb;
1958 ext4_init_journal_params(sb, journal);
1959 return journal;
1960}
1961
1962static journal_t *ext4_get_dev_journal(struct super_block *sb,
1963 dev_t j_dev)
1964{
1965 struct buffer_head * bh;
1966 journal_t *journal;
1967 ext4_fsblk_t start;
1968 ext4_fsblk_t len;
1969 int hblock, blocksize;
1970 ext4_fsblk_t sb_block;
1971 unsigned long offset;
1972 struct ext4_super_block * es;
1973 struct block_device *bdev;
1974
1975 bdev = ext4_blkdev_get(j_dev);
1976 if (bdev == NULL)
1977 return NULL;
1978
1979 if (bd_claim(bdev, sb)) {
1980 printk(KERN_ERR
1981 "EXT4: failed to claim external journal device.\n");
1982 blkdev_put(bdev);
1983 return NULL;
1984 }
1985
1986 blocksize = sb->s_blocksize;
1987 hblock = bdev_hardsect_size(bdev);
1988 if (blocksize < hblock) {
1989 printk(KERN_ERR
1990 "EXT4-fs: blocksize too small for journal device.\n");
1991 goto out_bdev;
1992 }
1993
1994 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
1995 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
1996 set_blocksize(bdev, blocksize);
1997 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1998 printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
1999 "external journal\n");
2000 goto out_bdev;
2001 }
2002
2003 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
2004 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2005 !(le32_to_cpu(es->s_feature_incompat) &
2006 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2007 printk(KERN_ERR "EXT4-fs: external journal has "
2008 "bad superblock\n");
2009 brelse(bh);
2010 goto out_bdev;
2011 }
2012
2013 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2014 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
2015 brelse(bh);
2016 goto out_bdev;
2017 }
2018
2019 len = ext4_blocks_count(es);
2020 start = sb_block + 1;
2021 brelse(bh); /* we're done with the superblock */
2022
2023 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
2024 start, len, blocksize);
2025 if (!journal) {
2026 printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
2027 goto out_bdev;
2028 }
2029 journal->j_private = sb;
2030 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2031 wait_on_buffer(journal->j_sb_buffer);
2032 if (!buffer_uptodate(journal->j_sb_buffer)) {
2033 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
2034 goto out_journal;
2035 }
2036 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2037 printk(KERN_ERR "EXT4-fs: External journal has more than one "
2038 "user (unsupported) - %d\n",
2039 be32_to_cpu(journal->j_superblock->s_nr_users));
2040 goto out_journal;
2041 }
2042 EXT4_SB(sb)->journal_bdev = bdev;
2043 ext4_init_journal_params(sb, journal);
2044 return journal;
2045out_journal:
2046 jbd2_journal_destroy(journal);
2047out_bdev:
2048 ext4_blkdev_put(bdev);
2049 return NULL;
2050}
2051
2052static int ext4_load_journal(struct super_block *sb,
2053 struct ext4_super_block *es,
2054 unsigned long journal_devnum)
2055{
2056 journal_t *journal;
2057 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2058 dev_t journal_dev;
2059 int err = 0;
2060 int really_read_only;
2061
2062 if (journal_devnum &&
2063 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2064 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
2065 "numbers have changed\n");
2066 journal_dev = new_decode_dev(journal_devnum);
2067 } else
2068 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2069
2070 really_read_only = bdev_read_only(sb->s_bdev);
2071
2072 /*
2073 * Are we loading a blank journal or performing recovery after a
2074 * crash? For recovery, we need to check in advance whether we
2075 * can get read-write access to the device.
2076 */
2077
2078 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2079 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_INFO "EXT4-fs: INFO: recovery "
2081 "required on readonly filesystem.\n");
2082 if (really_read_only) {
2083 printk(KERN_ERR "EXT4-fs: write access "
2084 "unavailable, cannot proceed.\n");
2085 return -EROFS;
2086 }
2087 printk (KERN_INFO "EXT4-fs: write access will "
2088 "be enabled during recovery.\n");
2089 }
2090 }
2091
2092 if (journal_inum && journal_dev) {
2093 printk(KERN_ERR "EXT4-fs: filesystem has both journal "
2094 "and inode journals!\n");
2095 return -EINVAL;
2096 }
2097
2098 if (journal_inum) {
2099 if (!(journal = ext4_get_journal(sb, journal_inum)))
2100 return -EINVAL;
2101 } else {
2102 if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
2103 return -EINVAL;
2104 }
2105
2106 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2107 err = jbd2_journal_update_format(journal);
2108 if (err) {
2109 printk(KERN_ERR "EXT4-fs: error updating journal.\n");
2110 jbd2_journal_destroy(journal);
2111 return err;
2112 }
2113 }
2114
2115 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
2116 err = jbd2_journal_wipe(journal, !really_read_only);
2117 if (!err)
2118 err = jbd2_journal_load(journal);
2119
2120 if (err) {
2121 printk(KERN_ERR "EXT4-fs: error loading journal.\n");
2122 jbd2_journal_destroy(journal);
2123 return err;
2124 }
2125
2126 EXT4_SB(sb)->s_journal = journal;
2127 ext4_clear_journal_err(sb, es);
2128
2129 if (journal_devnum &&
2130 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2131 es->s_journal_dev = cpu_to_le32(journal_devnum);
2132 sb->s_dirt = 1;
2133
2134 /* Make sure we flush the recovery flag to disk. */
2135 ext4_commit_super(sb, es, 1);
2136 }
2137
2138 return 0;
2139}
2140
2141static int ext4_create_journal(struct super_block * sb,
2142 struct ext4_super_block * es,
2143 unsigned int journal_inum)
2144{
2145 journal_t *journal;
2146
2147 if (sb->s_flags & MS_RDONLY) {
2148 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2149 "create journal.\n");
2150 return -EROFS;
2151 }
2152
2153 if (!(journal = ext4_get_journal(sb, journal_inum)))
2154 return -EINVAL;
2155
2156 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2157 journal_inum);
2158
2159 if (jbd2_journal_create(journal)) {
2160 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2161 jbd2_journal_destroy(journal);
2162 return -EIO;
2163 }
2164
2165 EXT4_SB(sb)->s_journal = journal;
2166
2167 ext4_update_dynamic_rev(sb);
2168 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2169 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2170
2171 es->s_journal_inum = cpu_to_le32(journal_inum);
2172 sb->s_dirt = 1;
2173
2174 /* Make sure we flush the recovery flag to disk. */
2175 ext4_commit_super(sb, es, 1);
2176
2177 return 0;
2178}
2179
2180static void ext4_commit_super (struct super_block * sb,
2181 struct ext4_super_block * es,
2182 int sync)
2183{
2184 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2185
2186 if (!sbh)
2187 return;
2188 es->s_wtime = cpu_to_le32(get_seconds());
2189 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2190 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2191 BUFFER_TRACE(sbh, "marking dirty");
2192 mark_buffer_dirty(sbh);
2193 if (sync)
2194 sync_dirty_buffer(sbh);
2195}
2196
2197
2198/*
2199 * Have we just finished recovery? If so, and if we are mounting (or
2200 * remounting) the filesystem readonly, then we will end up with a
2201 * consistent fs on disk. Record that fact.
2202 */
2203static void ext4_mark_recovery_complete(struct super_block * sb,
2204 struct ext4_super_block * es)
2205{
2206 journal_t *journal = EXT4_SB(sb)->s_journal;
2207
2208 jbd2_journal_lock_updates(journal);
2209 jbd2_journal_flush(journal);
2210 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2211 sb->s_flags & MS_RDONLY) {
2212 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2213 sb->s_dirt = 0;
2214 ext4_commit_super(sb, es, 1);
2215 }
2216 jbd2_journal_unlock_updates(journal);
2217}
2218
2219/*
2220 * If we are mounting (or read-write remounting) a filesystem whose journal
2221 * has recorded an error from a previous lifetime, move that error to the
2222 * main filesystem now.
2223 */
2224static void ext4_clear_journal_err(struct super_block * sb,
2225 struct ext4_super_block * es)
2226{
2227 journal_t *journal;
2228 int j_errno;
2229 const char *errstr;
2230
2231 journal = EXT4_SB(sb)->s_journal;
2232
2233 /*
2234 * Now check for any error status which may have been recorded in the
2235 * journal by a prior ext4_error() or ext4_abort()
2236 */
2237
2238 j_errno = jbd2_journal_errno(journal);
2239 if (j_errno) {
2240 char nbuf[16];
2241
2242 errstr = ext4_decode_error(sb, j_errno, nbuf);
2243 ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
2244 "from previous mount: %s", errstr);
2245 ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
2246 "filesystem check.");
2247
2248 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2249 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2250 ext4_commit_super (sb, es, 1);
2251
2252 jbd2_journal_clear_err(journal);
2253 }
2254}
2255
2256/*
2257 * Force the running and committing transactions to commit,
2258 * and wait on the commit.
2259 */
2260int ext4_force_commit(struct super_block *sb)
2261{
2262 journal_t *journal;
2263 int ret;
2264
2265 if (sb->s_flags & MS_RDONLY)
2266 return 0;
2267
2268 journal = EXT4_SB(sb)->s_journal;
2269 sb->s_dirt = 0;
2270 ret = ext4_journal_force_commit(journal);
2271 return ret;
2272}
2273
2274/*
2275 * Ext4 always journals updates to the superblock itself, so we don't
2276 * have to propagate any other updates to the superblock on disk at this
2277 * point. Just start an async writeback to get the buffers on their way
2278 * to the disk.
2279 *
2280 * This implicitly triggers the writebehind on sync().
2281 */
2282
2283static void ext4_write_super (struct super_block * sb)
2284{
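	/* The VFS calls ->write_super() with the superblock lock already held,
	 * so a successful trylock here would mean the caller did not hold
	 * s_lock - hence the BUG(). */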
2285 if (mutex_trylock(&sb->s_lock) != 0)
2286 BUG();
2287 sb->s_dirt = 0;
2288}
2289
2290static int ext4_sync_fs(struct super_block *sb, int wait)
2291{
2292 tid_t target;
2293
2294 sb->s_dirt = 0;
2295 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2296 if (wait)
2297 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2298 }
2299 return 0;
2300}
2301
2302/*
2303 * LVM calls this function before a (read-only) snapshot is created. This
2304 * gives us a chance to flush the journal completely and mark the fs clean.
2305 */
2306static void ext4_write_super_lockfs(struct super_block *sb)
2307{
2308 sb->s_dirt = 0;
2309
2310 if (!(sb->s_flags & MS_RDONLY)) {
2311 journal_t *journal = EXT4_SB(sb)->s_journal;
2312
2313 /* Now we set up the journal barrier. */
2314 jbd2_journal_lock_updates(journal);
2315 jbd2_journal_flush(journal);
2316
2317 /* Journal blocked and flushed, clear needs_recovery flag. */
2318 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2319 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2320 }
2321}
2322
2323/*
2324 * Called by LVM after the snapshot is done. We need to restore the RECOVER
2325 * flag here, even though the filesystem is not technically dirty yet.
2326 */
2327static void ext4_unlockfs(struct super_block *sb)
2328{
2329 if (!(sb->s_flags & MS_RDONLY)) {
2330 lock_super(sb);
2331 /* Restore the needs_recovery flag before the fs is unlocked. */
2332 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2333 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2334 unlock_super(sb);
2335 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2336 }
2337}
2338
2339static int ext4_remount (struct super_block * sb, int * flags, char * data)
2340{
2341 struct ext4_super_block * es;
2342 struct ext4_sb_info *sbi = EXT4_SB(sb);
2343 ext4_fsblk_t n_blocks_count = 0;
2344 unsigned long old_sb_flags;
2345 struct ext4_mount_options old_opts;
2346 int err;
2347#ifdef CONFIG_QUOTA
2348 int i;
2349#endif
2350
2351 /* Store the original options */
2352 old_sb_flags = sb->s_flags;
2353 old_opts.s_mount_opt = sbi->s_mount_opt;
2354 old_opts.s_resuid = sbi->s_resuid;
2355 old_opts.s_resgid = sbi->s_resgid;
2356 old_opts.s_commit_interval = sbi->s_commit_interval;
2357#ifdef CONFIG_QUOTA
2358 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2359 for (i = 0; i < MAXQUOTAS; i++)
2360 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2361#endif
2362
2363 /*
2364 * Allow the "check" option to be passed as a remount option.
2365 */
2366 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2367 err = -EINVAL;
2368 goto restore_opts;
2369 }
2370
2371 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
2372 ext4_abort(sb, __FUNCTION__, "Abort forced by user");
2373
2374 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2375 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2376
2377 es = sbi->s_es;
2378
2379 ext4_init_journal_params(sb, sbi->s_journal);
2380
2381 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2382 n_blocks_count > ext4_blocks_count(es)) {
2383 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
2384 err = -EROFS;
2385 goto restore_opts;
2386 }
2387
2388 if (*flags & MS_RDONLY) {
2389 /*
2390 * First of all, the unconditional stuff we have to do
2391 * to disable replay of the journal when we next remount
2392 */
2393 sb->s_flags |= MS_RDONLY;
2394
2395 /*
2396 * OK, test if we are remounting a valid rw partition
2397 * readonly, and if so set the rdonly flag and then
2398 * mark the partition as valid again.
2399 */
2400 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
2401 (sbi->s_mount_state & EXT4_VALID_FS))
2402 es->s_state = cpu_to_le16(sbi->s_mount_state);
2403
2404 ext4_mark_recovery_complete(sb, es);
2405 } else {
2406 __le32 ret;
2407 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2408 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
2409 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
2410 "remount RDWR because of unsupported "
2411 "optional features (%x).\n",
2412 sb->s_id, le32_to_cpu(ret));
2413 err = -EROFS;
2414 goto restore_opts;
2415 }
2416 /*
2417 * Mounting an RDONLY partition read-write, so reread
2418 * and store the current valid flag. (It may have
2419 * been changed by e2fsck since we originally mounted
2420 * the partition.)
2421 */
2422 ext4_clear_journal_err(sb, es);
2423 sbi->s_mount_state = le16_to_cpu(es->s_state);
2424 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
2425 goto restore_opts;
2426 if (!ext4_setup_super (sb, es, 0))
2427 sb->s_flags &= ~MS_RDONLY;
2428 }
2429 }
2430#ifdef CONFIG_QUOTA
2431 /* Release old quota file names */
2432 for (i = 0; i < MAXQUOTAS; i++)
2433 if (old_opts.s_qf_names[i] &&
2434 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2435 kfree(old_opts.s_qf_names[i]);
2436#endif
2437 return 0;
2438restore_opts:
2439 sb->s_flags = old_sb_flags;
2440 sbi->s_mount_opt = old_opts.s_mount_opt;
2441 sbi->s_resuid = old_opts.s_resuid;
2442 sbi->s_resgid = old_opts.s_resgid;
2443 sbi->s_commit_interval = old_opts.s_commit_interval;
2444#ifdef CONFIG_QUOTA
2445 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2446 for (i = 0; i < MAXQUOTAS; i++) {
2447 if (sbi->s_qf_names[i] &&
2448 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2449 kfree(sbi->s_qf_names[i]);
2450 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2451 }
2452#endif
2453 return err;
2454}
2455
2456static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2457{
2458 struct super_block *sb = dentry->d_sb;
2459 struct ext4_sb_info *sbi = EXT4_SB(sb);
2460 struct ext4_super_block *es = sbi->s_es;
2461 ext4_fsblk_t overhead;
2462 int i;
2463
2464 if (test_opt (sb, MINIX_DF))
2465 overhead = 0;
2466 else {
2467 unsigned long ngroups;
2468 ngroups = EXT4_SB(sb)->s_groups_count;
2469 smp_rmb();
2470
2471 /*
2472 * Compute the overhead (FS structures)
2473 */
2474
2475 /*
2476 * All of the blocks before first_data_block are
2477 * overhead
2478 */
2479 overhead = le32_to_cpu(es->s_first_data_block);
2480
2481 /*
2482 * Add the overhead attributed to the superblock and
2483 * block group descriptors. If the sparse superblocks
2484 * feature is turned on, then not all groups have this.
2485 */
2486 for (i = 0; i < ngroups; i++) {
2487 overhead += ext4_bg_has_super(sb, i) +
2488 ext4_bg_num_gdb(sb, i);
2489 cond_resched();
2490 }
2491
2492 /*
2493 * Every block group has an inode bitmap, a block
2494 * bitmap, and an inode table.
2495 */
2496 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
2497 }
2498
2499 buf->f_type = EXT4_SUPER_MAGIC;
2500 buf->f_bsize = sb->s_blocksize;
2501 buf->f_blocks = ext4_blocks_count(es) - overhead;
2502 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2503 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2504 if (buf->f_bfree < ext4_r_blocks_count(es))
2505 buf->f_bavail = 0;
2506 buf->f_files = le32_to_cpu(es->s_inodes_count);
2507 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2508 buf->f_namelen = EXT4_NAME_LEN;
2509 return 0;
2510}
2511
2512/* Helper function for writing quotas on sync - we need to start a transaction
2513 * before the quota file is locked for write. Otherwise the following deadlocks are possible:
2514 * Process 1                         Process 2
2515 * ext4_create()                     quota_sync()
2516 *   jbd2_journal_start()              write_dquot()
2517 *   DQUOT_INIT()                        down(dqio_mutex)
2518 *     down(dqio_mutex)                    jbd2_journal_start()
2519 *
2520 */
2521
2522#ifdef CONFIG_QUOTA
2523
2524static inline struct inode *dquot_to_inode(struct dquot *dquot)
2525{
2526 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2527}
2528
2529static int ext4_dquot_initialize(struct inode *inode, int type)
2530{
2531 handle_t *handle;
2532 int ret, err;
2533
2534 /* We may create quota structure so we need to reserve enough blocks */
2535 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
2536 if (IS_ERR(handle))
2537 return PTR_ERR(handle);
2538 ret = dquot_initialize(inode, type);
2539 err = ext4_journal_stop(handle);
2540 if (!ret)
2541 ret = err;
2542 return ret;
2543}
2544
2545static int ext4_dquot_drop(struct inode *inode)
2546{
2547 handle_t *handle;
2548 int ret, err;
2549
2550 /* We may delete quota structure so we need to reserve enough blocks */
2551 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
2552 if (IS_ERR(handle))
2553 return PTR_ERR(handle);
2554 ret = dquot_drop(inode);
2555 err = ext4_journal_stop(handle);
2556 if (!ret)
2557 ret = err;
2558 return ret;
2559}
2560
2561static int ext4_write_dquot(struct dquot *dquot)
2562{
2563 int ret, err;
2564 handle_t *handle;
2565 struct inode *inode;
2566
2567 inode = dquot_to_inode(dquot);
2568 handle = ext4_journal_start(inode,
2569 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2570 if (IS_ERR(handle))
2571 return PTR_ERR(handle);
2572 ret = dquot_commit(dquot);
2573 err = ext4_journal_stop(handle);
2574 if (!ret)
2575 ret = err;
2576 return ret;
2577}
2578
2579static int ext4_acquire_dquot(struct dquot *dquot)
2580{
2581 int ret, err;
2582 handle_t *handle;
2583
2584 handle = ext4_journal_start(dquot_to_inode(dquot),
2585 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2586 if (IS_ERR(handle))
2587 return PTR_ERR(handle);
2588 ret = dquot_acquire(dquot);
2589 err = ext4_journal_stop(handle);
2590 if (!ret)
2591 ret = err;
2592 return ret;
2593}
2594
2595static int ext4_release_dquot(struct dquot *dquot)
2596{
2597 int ret, err;
2598 handle_t *handle;
2599
2600 handle = ext4_journal_start(dquot_to_inode(dquot),
2601 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2602 if (IS_ERR(handle))
2603 return PTR_ERR(handle);
2604 ret = dquot_release(dquot);
2605 err = ext4_journal_stop(handle);
2606 if (!ret)
2607 ret = err;
2608 return ret;
2609}
2610
2611static int ext4_mark_dquot_dirty(struct dquot *dquot)
2612{
2613 /* Are we journalling quotas? */
2614 if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2615 EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2616 dquot_mark_dquot_dirty(dquot);
2617 return ext4_write_dquot(dquot);
2618 } else {
2619 return dquot_mark_dquot_dirty(dquot);
2620 }
2621}
2622
2623static int ext4_write_info(struct super_block *sb, int type)
2624{
2625 int ret, err;
2626 handle_t *handle;
2627
2628 /* Data block + inode block */
2629 handle = ext4_journal_start(sb->s_root->d_inode, 2);
2630 if (IS_ERR(handle))
2631 return PTR_ERR(handle);
2632 ret = dquot_commit_info(sb, type);
2633 err = ext4_journal_stop(handle);
2634 if (!ret)
2635 ret = err;
2636 return ret;
2637}
2638
2639/*
2640 * Turn on quotas during mount time - we need to find
2641 * the quota file and such...
2642 */
2643static int ext4_quota_on_mount(struct super_block *sb, int type)
2644{
2645 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
2646 EXT4_SB(sb)->s_jquota_fmt, type);
2647}
2648
2649/*
2650 * Standard function to be called on quota_on
2651 */
2652static int ext4_quota_on(struct super_block *sb, int type, int format_id,
2653 char *path)
2654{
2655 int err;
2656 struct nameidata nd;
2657
2658 if (!test_opt(sb, QUOTA))
2659 return -EINVAL;
2660 /* Not journalling quota? */
2661 if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
2662 !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
2663 return vfs_quota_on(sb, type, format_id, path);
2664 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2665 if (err)
2666 return err;
2667 /* Quotafile not on the same filesystem? */
2668 if (nd.mnt->mnt_sb != sb) {
2669 path_release(&nd);
2670 return -EXDEV;
2671 }
2672 /* Quotafile not of fs root? */
2673 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2674 printk(KERN_WARNING
2675 "EXT4-fs: Quota file not on filesystem root. "
2676 "Journalled quota will not work.\n");
2677 path_release(&nd);
2678 return vfs_quota_on(sb, type, format_id, path);
2679}
2680
2681/* Read data from the quota file - avoid the pagecache and such because we cannot
2682 * afford to acquire the locks... As quota files are never truncated and the quota code
2683 * itself serializes the operations (and no one else should touch the files),
2684 * we don't have to be afraid of races */
2685static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2686 size_t len, loff_t off)
2687{
2688 struct inode *inode = sb_dqopt(sb)->files[type];
2689 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2690 int err = 0;
2691 int offset = off & (sb->s_blocksize - 1);
2692 int tocopy;
2693 size_t toread;
2694 struct buffer_head *bh;
2695 loff_t i_size = i_size_read(inode);
2696
2697 if (off > i_size)
2698 return 0;
2699 if (off+len > i_size)
2700 len = i_size-off;
2701 toread = len;
2702 while (toread > 0) {
2703 tocopy = sb->s_blocksize - offset < toread ?
2704 sb->s_blocksize - offset : toread;
2705 bh = ext4_bread(NULL, inode, blk, 0, &err);
2706 if (err)
2707 return err;
2708 if (!bh) /* A hole? */
2709 memset(data, 0, tocopy);
2710 else
2711 memcpy(data, bh->b_data+offset, tocopy);
2712 brelse(bh);
2713 offset = 0;
2714 toread -= tocopy;
2715 data += tocopy;
2716 blk++;
2717 }
2718 return len;
2719}
2720
2721/* Write to quotafile (we know the transaction is already started and has
2722 * enough credits) */
2723static ssize_t ext4_quota_write(struct super_block *sb, int type,
2724 const char *data, size_t len, loff_t off)
2725{
2726 struct inode *inode = sb_dqopt(sb)->files[type];
2727 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2728 int err = 0;
2729 int offset = off & (sb->s_blocksize - 1);
2730 int tocopy;
2731 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
2732 size_t towrite = len;
2733 struct buffer_head *bh;
2734 handle_t *handle = journal_current_handle();
2735
2736 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2737 while (towrite > 0) {
2738 tocopy = sb->s_blocksize - offset < towrite ?
2739 sb->s_blocksize - offset : towrite;
2740 bh = ext4_bread(handle, inode, blk, 1, &err);
2741 if (!bh)
2742 goto out;
2743 if (journal_quota) {
2744 err = ext4_journal_get_write_access(handle, bh);
2745 if (err) {
2746 brelse(bh);
2747 goto out;
2748 }
2749 }
2750 lock_buffer(bh);
2751 memcpy(bh->b_data+offset, data, tocopy);
2752 flush_dcache_page(bh->b_page);
2753 unlock_buffer(bh);
2754 if (journal_quota)
2755 err = ext4_journal_dirty_metadata(handle, bh);
2756 else {
2757 /* Always do at least ordered writes for quotas */
2758 err = ext4_journal_dirty_data(handle, bh);
2759 mark_buffer_dirty(bh);
2760 }
2761 brelse(bh);
2762 if (err)
2763 goto out;
2764 offset = 0;
2765 towrite -= tocopy;
2766 data += tocopy;
2767 blk++;
2768 }
2769out:
2770 if (len == towrite)
2771 return err;
2772 if (inode->i_size < off+len-towrite) {
2773 i_size_write(inode, off+len-towrite);
2774 EXT4_I(inode)->i_disksize = inode->i_size;
2775 }
2776 inode->i_version++;
2777 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2778 ext4_mark_inode_dirty(handle, inode);
2779 mutex_unlock(&inode->i_mutex);
2780 return len - towrite;
2781}
2782
2783#endif
2784
2785static int ext4_get_sb(struct file_system_type *fs_type,
2786 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2787{
2788 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
2789}
2790
2791static struct file_system_type ext4dev_fs_type = {
2792 .owner = THIS_MODULE,
2793 .name = "ext4dev",
2794 .get_sb = ext4_get_sb,
2795 .kill_sb = kill_block_super,
2796 .fs_flags = FS_REQUIRES_DEV,
2797};
2798
2799static int __init init_ext4_fs(void)
2800{
2801 int err = init_ext4_xattr();
2802 if (err)
2803 return err;
2804 err = init_inodecache();
2805 if (err)
2806 goto out1;
2807 err = register_filesystem(&ext4dev_fs_type);
2808 if (err)
2809 goto out;
2810 return 0;
2811out:
2812 destroy_inodecache();
2813out1:
2814 exit_ext4_xattr();
2815 return err;
2816}
2817
2818static void __exit exit_ext4_fs(void)
2819{
2820 unregister_filesystem(&ext4dev_fs_type);
2821 destroy_inodecache();
2822 exit_ext4_xattr();
2823}
2824
2825MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2826MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
2827MODULE_LICENSE("GPL");
2828module_init(init_ext4_fs)
2829module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext4/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext4 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/ext4_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return NULL;
31}
32
33struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
1/*
2 * linux/fs/ext4/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext4_jbd2.h>
57#include <linux/ext4_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#define IHDR(inode, raw_inode) \
70 ((struct ext4_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT4_GOOD_OLD_INODE_SIZE + \
73 EXT4_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
75
76#ifdef EXT4_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%lu: ", \
79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \
81 printk("\n"); \
82 } while (0)
83# define ea_bdebug(bh, f...) do { \
84 char b[BDEVNAME_SIZE]; \
85 printk(KERN_DEBUG "block %s:%lu: ", \
86 bdevname(bh->b_bdev, b), \
87 (unsigned long) bh->b_blocknr); \
88 printk(f); \
89 printk("\n"); \
90 } while (0)
91#else
92# define ea_idebug(f...)
93# define ea_bdebug(f...)
94#endif
95
96static void ext4_xattr_cache_insert(struct buffer_head *);
97static struct buffer_head *ext4_xattr_cache_find(struct inode *,
98 struct ext4_xattr_header *,
99 struct mb_cache_entry **);
100static void ext4_xattr_rehash(struct ext4_xattr_header *,
101 struct ext4_xattr_entry *);
102
103static struct mb_cache *ext4_xattr_cache;
104
105static struct xattr_handler *ext4_xattr_handler_map[] = {
106 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
107#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
108 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
109 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
110#endif
111 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
112#ifdef CONFIG_EXT4DEV_FS_SECURITY
113 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
114#endif
115};
116
117struct xattr_handler *ext4_xattr_handlers[] = {
118 &ext4_xattr_user_handler,
119 &ext4_xattr_trusted_handler,
120#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
121 &ext4_xattr_acl_access_handler,
122 &ext4_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT4DEV_FS_SECURITY
125 &ext4_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext4_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
136 handler = ext4_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_mutex: don't care
144 */
145ssize_t
146ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext4_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext4_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext4_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext4_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213static int
214ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext4_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT4_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext4_xattr_check_block(bh)) {
235bad_block: ext4_error(inode->i_sb, __FUNCTION__,
236 "inode %lu: bad block %llu", inode->i_ino,
237 EXT4_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext4_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext4_xattr_ibody_header *header;
268 struct ext4_xattr_entry *entry;
269 struct ext4_inode *raw_inode;
270 struct ext4_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
276 return -ENODATA;
277 error = ext4_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext4_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
284 error = ext4_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext4_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext4_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Passing a NULL buffer computes only the required size.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext4_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT4_I(inode)->xattr_sem);
323 error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext4_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT4_I(inode)->xattr_sem);
329 return error;
330}
331
332static int
333ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext4_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357static int
358ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT4_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext4_xattr_check_block(bh)) {
377 ext4_error(inode->i_sb, __FUNCTION__,
378 "inode %lu: bad block %llu", inode->i_ino,
379 EXT4_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext4_xattr_cache_insert(bh);
384 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext4_xattr_ibody_header *header;
396 struct ext4_inode *raw_inode;
397 struct ext4_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
402 return 0;
403 error = ext4_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext4_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
409 error = ext4_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext4_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext4_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * Passing a NULL buffer computes only the required size.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT4_I(inode)->xattr_sem);
436 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT4_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext4_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
464 EXT4_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
472/*
473 * Release the xattr block BH: If the reference count is > 1, decrement
474 * it; otherwise free the block.
475 */
476static void
477ext4_xattr_release_block(handle_t *handle, struct inode *inode,
478 struct buffer_head *bh)
479{
480 struct mb_cache_entry *ce = NULL;
481
482 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
483 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
484 ea_bdebug(bh, "refcount now=0; freeing");
485 if (ce)
486 mb_cache_entry_free(ce);
487 ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
488 get_bh(bh);
489 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
490 } else {
491 if (ext4_journal_get_write_access(handle, bh) == 0) {
492 lock_buffer(bh);
493 BHDR(bh)->h_refcount = cpu_to_le32(
494 le32_to_cpu(BHDR(bh)->h_refcount) - 1);
495 ext4_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 DQUOT_FREE_BLOCK(inode, 1);
499 unlock_buffer(bh);
500 ea_bdebug(bh, "refcount now=%d; releasing",
501 le32_to_cpu(BHDR(bh)->h_refcount));
502 }
503 if (ce)
504 mb_cache_entry_release(ce);
505 }
506}
507
508struct ext4_xattr_info {
509 int name_index;
510 const char *name;
511 const void *value;
512 size_t value_len;
513};
514
515struct ext4_xattr_search {
516 struct ext4_xattr_entry *first;
517 void *base;
518 void *end;
519 struct ext4_xattr_entry *here;
520 int not_found;
521};
522
523static int
524ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
525{
526 struct ext4_xattr_entry *last;
527 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
528
529 /* Compute min_offs and last. */
530 last = s->first;
531 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
532 if (!last->e_value_block && last->e_value_size) {
533 size_t offs = le16_to_cpu(last->e_value_offs);
534 if (offs < min_offs)
535 min_offs = offs;
536 }
537 }
538 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
539 if (!s->not_found) {
540 if (!s->here->e_value_block && s->here->e_value_size) {
541 size_t size = le32_to_cpu(s->here->e_value_size);
542 free += EXT4_XATTR_SIZE(size);
543 }
544 free += EXT4_XATTR_LEN(name_len);
545 }
546 if (i->value) {
547 if (free < EXT4_XATTR_SIZE(i->value_len) ||
548 free < EXT4_XATTR_LEN(name_len) +
549 EXT4_XATTR_SIZE(i->value_len))
550 return -ENOSPC;
551 }
552
553 if (i->value && s->not_found) {
554 /* Insert the new name. */
555 size_t size = EXT4_XATTR_LEN(name_len);
556 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
557 memmove((void *)s->here + size, s->here, rest);
558 memset(s->here, 0, size);
559 s->here->e_name_index = i->name_index;
560 s->here->e_name_len = name_len;
561 memcpy(s->here->e_name, i->name, name_len);
562 } else {
563 if (!s->here->e_value_block && s->here->e_value_size) {
564 void *first_val = s->base + min_offs;
565 size_t offs = le16_to_cpu(s->here->e_value_offs);
566 void *val = s->base + offs;
567 size_t size = EXT4_XATTR_SIZE(
568 le32_to_cpu(s->here->e_value_size));
569
570 if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
571 /* The old and the new value have the same
572 size. Just replace. */
573 s->here->e_value_size =
574 cpu_to_le32(i->value_len);
575 memset(val + size - EXT4_XATTR_PAD, 0,
576 EXT4_XATTR_PAD); /* Clear pad bytes. */
577 memcpy(val, i->value, i->value_len);
578 return 0;
579 }
580
581 /* Remove the old value. */
582 memmove(first_val + size, first_val, val - first_val);
583 memset(first_val, 0, size);
584 s->here->e_value_size = 0;
585 s->here->e_value_offs = 0;
586 min_offs += size;
587
588 /* Adjust all value offsets. */
589 last = s->first;
590 while (!IS_LAST_ENTRY(last)) {
591 size_t o = le16_to_cpu(last->e_value_offs);
592 if (!last->e_value_block &&
593 last->e_value_size && o < offs)
594 last->e_value_offs =
595 cpu_to_le16(o + size);
596 last = EXT4_XATTR_NEXT(last);
597 }
598 }
599 if (!i->value) {
600 /* Remove the old name. */
601 size_t size = EXT4_XATTR_LEN(name_len);
602 last = ENTRY((void *)last - size);
603 memmove(s->here, (void *)s->here + size,
604 (void *)last - (void *)s->here + sizeof(__u32));
605 memset(last, 0, size);
606 }
607 }
608
609 if (i->value) {
610 /* Insert the new value. */
611 s->here->e_value_size = cpu_to_le32(i->value_len);
612 if (i->value_len) {
613 size_t size = EXT4_XATTR_SIZE(i->value_len);
614 void *val = s->base + min_offs - size;
615 s->here->e_value_offs = cpu_to_le16(min_offs - size);
616 memset(val + size - EXT4_XATTR_PAD, 0,
617 EXT4_XATTR_PAD); /* Clear the pad bytes. */
618 memcpy(val, i->value, i->value_len);
619 }
620 }
621 return 0;
622}
623
624struct ext4_xattr_block_find {
625 struct ext4_xattr_search s;
626 struct buffer_head *bh;
627};
628
629static int
630ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
631 struct ext4_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT4_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext4_xattr_check_block(bs->bh)) {
649 ext4_error(sb, __FUNCTION__,
650 "inode %lu: bad block %llu", inode->i_ino,
651 EXT4_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
672static int
673ext4_xattr_block_set(handle_t *handle, struct inode *inode,
674 struct ext4_xattr_info *i,
675 struct ext4_xattr_block_find *bs)
676{
677 struct super_block *sb = inode->i_sb;
678 struct buffer_head *new_bh = NULL;
679 struct ext4_xattr_search *s = &bs->s;
680 struct mb_cache_entry *ce = NULL;
681 int error;
682
683#define header(x) ((struct ext4_xattr_header *)(x))
684
685 if (i->value && i->value_len > sb->s_blocksize)
686 return -ENOSPC;
687 if (s->base) {
688 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
689 bs->bh->b_blocknr);
690 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
691 if (ce) {
692 mb_cache_entry_free(ce);
693 ce = NULL;
694 }
695 ea_bdebug(bs->bh, "modifying in-place");
696 error = ext4_journal_get_write_access(handle, bs->bh);
697 if (error)
698 goto cleanup;
699 lock_buffer(bs->bh);
700 error = ext4_xattr_set_entry(i, s);
701 if (!error) {
702 if (!IS_LAST_ENTRY(s->first))
703 ext4_xattr_rehash(header(s->base),
704 s->here);
705 ext4_xattr_cache_insert(bs->bh);
706 }
707 unlock_buffer(bs->bh);
708 if (error == -EIO)
709 goto bad_block;
710 if (!error)
711 error = ext4_journal_dirty_metadata(handle,
712 bs->bh);
713 if (error)
714 goto cleanup;
715 goto inserted;
716 } else {
717 int offset = (char *)s->here - bs->bh->b_data;
718
719 if (ce) {
720 mb_cache_entry_release(ce);
721 ce = NULL;
722 }
723 ea_bdebug(bs->bh, "cloning");
724 s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
725 error = -ENOMEM;
726 if (s->base == NULL)
727 goto cleanup;
728 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
729 s->first = ENTRY(header(s->base)+1);
730 header(s->base)->h_refcount = cpu_to_le32(1);
731 s->here = ENTRY(s->base + offset);
732 s->end = s->base + bs->bh->b_size;
733 }
734 } else {
735 /* Allocate a buffer where we construct the new block. */
736 s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
737 /* assert(header == s->base) */
738 error = -ENOMEM;
739 if (s->base == NULL)
740 goto cleanup;
741 memset(s->base, 0, sb->s_blocksize);
742 header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
743 header(s->base)->h_blocks = cpu_to_le32(1);
744 header(s->base)->h_refcount = cpu_to_le32(1);
745 s->first = ENTRY(header(s->base)+1);
746 s->here = ENTRY(header(s->base)+1);
747 s->end = s->base + sb->s_blocksize;
748 }
749
750 error = ext4_xattr_set_entry(i, s);
751 if (error == -EIO)
752 goto bad_block;
753 if (error)
754 goto cleanup;
755 if (!IS_LAST_ENTRY(s->first))
756 ext4_xattr_rehash(header(s->base), s->here);
757
758inserted:
759 if (!IS_LAST_ENTRY(s->first)) {
760 new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
761 if (new_bh) {
762 /* We found an identical block in the cache. */
763 if (new_bh == bs->bh)
764 ea_bdebug(new_bh, "keeping");
765 else {
766 /* The old block is released after updating
767 the inode. */
768 error = -EDQUOT;
769 if (DQUOT_ALLOC_BLOCK(inode, 1))
770 goto cleanup;
771 error = ext4_journal_get_write_access(handle,
772 new_bh);
773 if (error)
774 goto cleanup_dquot;
775 lock_buffer(new_bh);
776 BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
777 le32_to_cpu(BHDR(new_bh)->h_refcount));
778 ea_bdebug(new_bh, "reusing; refcount now=%d",
779 le32_to_cpu(BHDR(new_bh)->h_refcount));
780 unlock_buffer(new_bh);
781 error = ext4_journal_dirty_metadata(handle,
782 new_bh);
783 if (error)
784 goto cleanup_dquot;
785 }
786 mb_cache_entry_release(ce);
787 ce = NULL;
788 } else if (bs->bh && s->base == bs->bh->b_data) {
789 /* We were modifying this block in-place. */
790 ea_bdebug(bs->bh, "keeping this block");
791 new_bh = bs->bh;
792 get_bh(new_bh);
793 } else {
794 /* We need to allocate a new block */
795 ext4_fsblk_t goal = le32_to_cpu(
796 EXT4_SB(sb)->s_es->s_first_data_block) +
797 (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
798 EXT4_BLOCKS_PER_GROUP(sb);
799 ext4_fsblk_t block = ext4_new_block(handle, inode,
800 goal, &error);
801 if (error)
802 goto cleanup;
803 ea_idebug(inode, "creating block %d", block);
804
805 new_bh = sb_getblk(sb, block);
806 if (!new_bh) {
807getblk_failed:
808 ext4_free_blocks(handle, inode, block, 1);
809 error = -EIO;
810 goto cleanup;
811 }
812 lock_buffer(new_bh);
813 error = ext4_journal_get_create_access(handle, new_bh);
814 if (error) {
815 unlock_buffer(new_bh);
816 goto getblk_failed;
817 }
818 memcpy(new_bh->b_data, s->base, new_bh->b_size);
819 set_buffer_uptodate(new_bh);
820 unlock_buffer(new_bh);
821 ext4_xattr_cache_insert(new_bh);
822 error = ext4_journal_dirty_metadata(handle, new_bh);
823 if (error)
824 goto cleanup;
825 }
826 }
827
828 /* Update the inode. */
829 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
830
831 /* Drop the previous xattr block. */
832 if (bs->bh && bs->bh != new_bh)
833 ext4_xattr_release_block(handle, inode, bs->bh);
834 error = 0;
835
836cleanup:
837 if (ce)
838 mb_cache_entry_release(ce);
839 brelse(new_bh);
840 if (!(bs->bh && s->base == bs->bh->b_data))
841 kfree(s->base);
842
843 return error;
844
845cleanup_dquot:
846 DQUOT_FREE_BLOCK(inode, 1);
847 goto cleanup;
848
849bad_block:
850 ext4_error(inode->i_sb, __FUNCTION__,
851 "inode %lu: bad block %llu", inode->i_ino,
852 EXT4_I(inode)->i_file_acl);
853 goto cleanup;
854
855#undef header
856}
857
858struct ext4_xattr_ibody_find {
859 struct ext4_xattr_search s;
860 struct ext4_iloc iloc;
861};
862
863static int
864ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
865 struct ext4_xattr_ibody_find *is)
866{
867 struct ext4_xattr_ibody_header *header;
868 struct ext4_inode *raw_inode;
869 int error;
870
871 if (EXT4_I(inode)->i_extra_isize == 0)
872 return 0;
873 raw_inode = ext4_raw_inode(&is->iloc);
874 header = IHDR(inode, raw_inode);
875 is->s.base = is->s.first = IFIRST(header);
876 is->s.here = is->s.first;
877 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
878 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
879 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
880 if (error)
881 return error;
882 /* Find the named attribute. */
883 error = ext4_xattr_find_entry(&is->s.here, i->name_index,
884 i->name, is->s.end -
885 (void *)is->s.base, 0);
886 if (error && error != -ENODATA)
887 return error;
888 is->s.not_found = error;
889 }
890 return 0;
891}
892
893static int
894ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
895 struct ext4_xattr_info *i,
896 struct ext4_xattr_ibody_find *is)
897{
898 struct ext4_xattr_ibody_header *header;
899 struct ext4_xattr_search *s = &is->s;
900 int error;
901
902 if (EXT4_I(inode)->i_extra_isize == 0)
903 return -ENOSPC;
904 error = ext4_xattr_set_entry(i, s);
905 if (error)
906 return error;
907 header = IHDR(inode, ext4_raw_inode(&is->iloc));
908 if (!IS_LAST_ENTRY(s->first)) {
909 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
910 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
911 } else {
912 header->h_magic = cpu_to_le32(0);
913 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
914 }
915 return 0;
916}
917
918/*
919 * ext4_xattr_set_handle()
920 *
921 * Create, replace or remove an extended attribute for this inode.  Value
922 * is NULL to remove an existing extended attribute, and non-NULL to
923 * either replace an existing extended attribute, or create a new extended
924 * attribute.  The flags XATTR_REPLACE and XATTR_CREATE specify that an
925 * extended attribute must already exist, or must not yet exist, prior to
926 * the call, respectively.
927 *
928 * Returns 0, or a negative error number on failure.
929 */
930int
931ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
932 const char *name, const void *value, size_t value_len,
933 int flags)
934{
935 struct ext4_xattr_info i = {
936 .name_index = name_index,
937 .name = name,
938 .value = value,
939 .value_len = value_len,
940
941 };
942 struct ext4_xattr_ibody_find is = {
943 .s = { .not_found = -ENODATA, },
944 };
945 struct ext4_xattr_block_find bs = {
946 .s = { .not_found = -ENODATA, },
947 };
948 int error;
949
950 if (!name)
951 return -EINVAL;
952 if (strlen(name) > 255)
953 return -ERANGE;
954 down_write(&EXT4_I(inode)->xattr_sem);
955 error = ext4_get_inode_loc(inode, &is.iloc);
956 if (error)
957 goto cleanup;
958
959 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
960 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
961 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
962 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
963 }
964
965 error = ext4_xattr_ibody_find(inode, &i, &is);
966 if (error)
967 goto cleanup;
968 if (is.s.not_found)
969 error = ext4_xattr_block_find(inode, &i, &bs);
970 if (error)
971 goto cleanup;
972 if (is.s.not_found && bs.s.not_found) {
973 error = -ENODATA;
974 if (flags & XATTR_REPLACE)
975 goto cleanup;
976 error = 0;
977 if (!value)
978 goto cleanup;
979 } else {
980 error = -EEXIST;
981 if (flags & XATTR_CREATE)
982 goto cleanup;
983 }
984 error = ext4_journal_get_write_access(handle, is.iloc.bh);
985 if (error)
986 goto cleanup;
987 if (!value) {
988 if (!is.s.not_found)
989 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
990 else if (!bs.s.not_found)
991 error = ext4_xattr_block_set(handle, inode, &i, &bs);
992 } else {
993 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
994 if (!error && !bs.s.not_found) {
995 i.value = NULL;
996 error = ext4_xattr_block_set(handle, inode, &i, &bs);
997 } else if (error == -ENOSPC) {
998 error = ext4_xattr_block_set(handle, inode, &i, &bs);
999 if (error)
1000 goto cleanup;
1001 if (!is.s.not_found) {
1002 i.value = NULL;
1003 error = ext4_xattr_ibody_set(handle, inode, &i,
1004 &is);
1005 }
1006 }
1007 }
1008 if (!error) {
1009 ext4_xattr_update_super_block(handle, inode->i_sb);
1010 inode->i_ctime = CURRENT_TIME_SEC;
1011 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1012 /*
1013 * The bh is consumed by ext4_mark_iloc_dirty, even with
1014 * error != 0.
1015 */
1016 is.iloc.bh = NULL;
1017 if (IS_SYNC(inode))
1018 handle->h_sync = 1;
1019 }
1020
1021cleanup:
1022 brelse(is.iloc.bh);
1023 brelse(bs.bh);
1024 up_write(&EXT4_I(inode)->xattr_sem);
1025 return error;
1026}
1027
1028/*
1029 * ext4_xattr_set()
1030 *
1031 * Like ext4_xattr_set_handle, but start from an inode. This extended
1032 * attribute modification is a filesystem transaction by itself.
1033 *
1034 * Returns 0, or a negative error number on failure.
1035 */
1036int
1037ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1038 const void *value, size_t value_len, int flags)
1039{
1040 handle_t *handle;
1041 int error, retries = 0;
1042
1043retry:
1044 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
1045 if (IS_ERR(handle)) {
1046 error = PTR_ERR(handle);
1047 } else {
1048 int error2;
1049
1050 error = ext4_xattr_set_handle(handle, inode, name_index, name,
1051 value, value_len, flags);
1052 error2 = ext4_journal_stop(handle);
1053 if (error == -ENOSPC &&
1054 ext4_should_retry_alloc(inode->i_sb, &retries))
1055 goto retry;
1056 if (error == 0)
1057 error = error2;
1058 }
1059
1060 return error;
1061}
1062
1063/*
1064 * ext4_xattr_delete_inode()
1065 *
1066 * Free extended attribute resources associated with this inode. This
1067 * is called immediately before an inode is freed. We have exclusive
1068 * access to the inode.
1069 */
1070void
1071ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1072{
1073 struct buffer_head *bh = NULL;
1074
1075 if (!EXT4_I(inode)->i_file_acl)
1076 goto cleanup;
1077 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1078 if (!bh) {
1079 ext4_error(inode->i_sb, __FUNCTION__,
1080 "inode %lu: block %llu read error", inode->i_ino,
1081 EXT4_I(inode)->i_file_acl);
1082 goto cleanup;
1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext4_error(inode->i_sb, __FUNCTION__,
1087 "inode %lu: bad block %llu", inode->i_ino,
1088 EXT4_I(inode)->i_file_acl);
1089 goto cleanup;
1090 }
1091 ext4_xattr_release_block(handle, inode, bh);
1092 EXT4_I(inode)->i_file_acl = 0;
1093
1094cleanup:
1095 brelse(bh);
1096}
1097
1098/*
1099 * ext4_xattr_put_super()
1100 *
1101 * This is called when a file system is unmounted.
1102 */
1103void
1104ext4_xattr_put_super(struct super_block *sb)
1105{
1106 mb_cache_shrink(sb->s_bdev);
1107}
1108
1109/*
1110 * ext4_xattr_cache_insert()
1111 *
1112 * Create a new entry in the extended attribute cache, and insert
1113 * it unless such an entry is already in the cache.
1114 *
1115 * Insertion failures are silently ignored; the cache is only an optimization.
1116 */
1117static void
1118ext4_xattr_cache_insert(struct buffer_head *bh)
1119{
1120 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1121 struct mb_cache_entry *ce;
1122 int error;
1123
1124 ce = mb_cache_entry_alloc(ext4_xattr_cache);
1125 if (!ce) {
1126 ea_bdebug(bh, "out of memory");
1127 return;
1128 }
1129 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1130 if (error) {
1131 mb_cache_entry_free(ce);
1132 if (error == -EBUSY) {
1133 ea_bdebug(bh, "already in cache");
1134 error = 0;
1135 }
1136 } else {
1137 ea_bdebug(bh, "inserting [%x]", (int)hash);
1138 mb_cache_entry_release(ce);
1139 }
1140}
1141
1142/*
1143 * ext4_xattr_cmp()
1144 *
1145 * Compare two extended attribute blocks for equality.
1146 *
1147 * Returns 0 if the blocks are equal, 1 if they differ, and
1148 * a negative error number on errors.
1149 */
1150static int
1151ext4_xattr_cmp(struct ext4_xattr_header *header1,
1152 struct ext4_xattr_header *header2)
1153{
1154 struct ext4_xattr_entry *entry1, *entry2;
1155
1156 entry1 = ENTRY(header1+1);
1157 entry2 = ENTRY(header2+1);
1158 while (!IS_LAST_ENTRY(entry1)) {
1159 if (IS_LAST_ENTRY(entry2))
1160 return 1;
1161 if (entry1->e_hash != entry2->e_hash ||
1162 entry1->e_name_index != entry2->e_name_index ||
1163 entry1->e_name_len != entry2->e_name_len ||
1164 entry1->e_value_size != entry2->e_value_size ||
1165 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1166 return 1;
1167 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1168 return -EIO;
1169 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1170 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1171 le32_to_cpu(entry1->e_value_size)))
1172 return 1;
1173
1174 entry1 = EXT4_XATTR_NEXT(entry1);
1175 entry2 = EXT4_XATTR_NEXT(entry2);
1176 }
1177 if (!IS_LAST_ENTRY(entry2))
1178 return 1;
1179 return 0;
1180}
1181
1182/*
1183 * ext4_xattr_cache_find()
1184 *
1185 * Find an identical extended attribute block.
1186 *
1187 * Returns a pointer to the block found, or NULL if such a block was
1188 * not found or an error occurred.
1189 */
1190static struct buffer_head *
1191ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1192 struct mb_cache_entry **pce)
1193{
1194 __u32 hash = le32_to_cpu(header->h_hash);
1195 struct mb_cache_entry *ce;
1196
1197 if (!header->h_hash)
1198 return NULL; /* never share */
1199 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1200again:
1201 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
1202 inode->i_sb->s_bdev, hash);
1203 while (ce) {
1204 struct buffer_head *bh;
1205
1206 if (IS_ERR(ce)) {
1207 if (PTR_ERR(ce) == -EAGAIN)
1208 goto again;
1209 break;
1210 }
1211 bh = sb_bread(inode->i_sb, ce->e_block);
1212 if (!bh) {
1213 ext4_error(inode->i_sb, __FUNCTION__,
1214 "inode %lu: block %lu read error",
1215 inode->i_ino, (unsigned long) ce->e_block);
1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1217 EXT4_XATTR_REFCOUNT_MAX) {
1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1219 (unsigned long) ce->e_block,
1220 le32_to_cpu(BHDR(bh)->h_refcount),
1221 EXT4_XATTR_REFCOUNT_MAX);
1222 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1223 *pce = ce;
1224 return bh;
1225 }
1226 brelse(bh);
1227 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
1228 }
1229 return NULL;
1230}
1231
1232#define NAME_HASH_SHIFT 5
1233#define VALUE_HASH_SHIFT 16
1234
1235/*
1236 * ext4_xattr_hash_entry()
1237 *
1238 * Compute the hash of an extended attribute.
1239 */
1240static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
1241 struct ext4_xattr_entry *entry)
1242{
1243 __u32 hash = 0;
1244 char *name = entry->e_name;
1245 int n;
1246
1247 for (n=0; n < entry->e_name_len; n++) {
1248 hash = (hash << NAME_HASH_SHIFT) ^
1249 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1250 *name++;
1251 }
1252
1253 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1254 __le32 *value = (__le32 *)((char *)header +
1255 le16_to_cpu(entry->e_value_offs));
1256 for (n = (le32_to_cpu(entry->e_value_size) +
1257 EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
1258 hash = (hash << VALUE_HASH_SHIFT) ^
1259 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1260 le32_to_cpu(*value++);
1261 }
1262 }
1263 entry->e_hash = cpu_to_le32(hash);
1264}
1265
1266#undef NAME_HASH_SHIFT
1267#undef VALUE_HASH_SHIFT
1268
1269#define BLOCK_HASH_SHIFT 16
1270
1271/*
1272 * ext4_xattr_rehash()
1273 *
1274 * Re-compute the extended attribute hash value after an entry has changed.
1275 */
1276static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1277 struct ext4_xattr_entry *entry)
1278{
1279 struct ext4_xattr_entry *here;
1280 __u32 hash = 0;
1281
1282 ext4_xattr_hash_entry(header, entry);
1283 here = ENTRY(header+1);
1284 while (!IS_LAST_ENTRY(here)) {
1285 if (!here->e_hash) {
1286 /* Block is not shared if an entry's hash value == 0 */
1287 hash = 0;
1288 break;
1289 }
1290 hash = (hash << BLOCK_HASH_SHIFT) ^
1291 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1292 le32_to_cpu(here->e_hash);
1293 here = EXT4_XATTR_NEXT(here);
1294 }
1295 header->h_hash = cpu_to_le32(hash);
1296}
1297
1298#undef BLOCK_HASH_SHIFT
1299
1300int __init
1301init_ext4_xattr(void)
1302{
1303 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
1304 sizeof(struct mb_cache_entry) +
1305 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1306 if (!ext4_xattr_cache)
1307 return -ENOMEM;
1308 return 0;
1309}
1310
1311void
1312exit_ext4_xattr(void)
1313{
1314 if (ext4_xattr_cache)
1315 mb_cache_destroy(ext4_xattr_cache);
1316 ext4_xattr_cache = NULL;
1317}
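
[Editorial sketch, not part of the patch.] The on-disk format documented at the top of xattr.c -- entry descriptors growing downward from the header, values packed upward from the end of the block, and four null bytes terminating the entry list -- can be modelled in a few lines of userspace C. All names here (demo_entry, demo_list, DEMO_*) are invented for illustration; inside the filesystem the real EXT4_XATTR_NEXT() and IS_LAST_ENTRY() macros are used instead.

/* Simplified, userspace-only model of an ext4-style xattr entry table walk. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct demo_entry {			/* loosely mirrors struct ext4_xattr_entry */
	uint8_t  e_name_len;
	uint8_t  e_name_index;
	uint16_t e_value_offs;		/* offset of the value within the block */
	uint32_t e_value_block;		/* always 0 */
	uint32_t e_value_size;
	uint32_t e_hash;
	char     e_name[];		/* name follows, padded to 4 bytes */
};

#define DEMO_PAD	4
#define DEMO_LEN(nl)	(((nl) + sizeof(struct demo_entry) + DEMO_PAD - 1) & ~(DEMO_PAD - 1))
#define DEMO_NEXT(e)	((struct demo_entry *)((char *)(e) + DEMO_LEN((e)->e_name_len)))
#define DEMO_IS_LAST(e)	(*(uint32_t *)(e) == 0)	/* the four null bytes */

static void demo_list(void *entries)
{
	struct demo_entry *e = entries;

	for (; !DEMO_IS_LAST(e); e = DEMO_NEXT(e))
		printf("index=%u name=%.*s value_size=%u\n",
		       (unsigned)e->e_name_index, (int)e->e_name_len,
		       e->e_name, (unsigned)e->e_value_size);
}

int main(void)
{
	uint32_t block[16] = { 0 };		/* zero-filled, so the terminator is implicit */
	struct demo_entry *e = (struct demo_entry *)block;

	e->e_name_index = 1;			/* the "user." namespace index */
	e->e_name_len   = 3;
	memcpy(e->e_name, "foo", 3);
	e->e_value_size = 5;
	e->e_value_offs = 56;			/* value sits at the end of the block */

	demo_list(block);
	return 0;
}
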
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
1/*
2 File: fs/ext4/xattr.h
3
4 On-disk format of extended attributes for the ext4 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT4_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT4_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT4_XATTR_INDEX_USER 1
19#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6
24
25struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext4_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext4_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41 __le32 e_value_block; /* disk block attribute is stored on (n/i) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT4_XATTR_PAD_BITS 2
48#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
49#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
50#define EXT4_XATTR_LEN(name_len) \
51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58
59# ifdef CONFIG_EXT4DEV_FS_XATTR
60
61extern struct xattr_handler ext4_xattr_user_handler;
62extern struct xattr_handler ext4_xattr_trusted_handler;
63extern struct xattr_handler ext4_xattr_acl_access_handler;
64extern struct xattr_handler ext4_xattr_acl_default_handler;
65extern struct xattr_handler ext4_xattr_security_handler;
66
67extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
68
69extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
70extern int ext4_xattr_list(struct inode *, char *, size_t);
71extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
72extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
73
74extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
75extern void ext4_xattr_put_super(struct super_block *);
76
77extern int init_ext4_xattr(void);
78extern void exit_ext4_xattr(void);
79
80extern struct xattr_handler *ext4_xattr_handlers[];
81
82# else /* CONFIG_EXT4DEV_FS_XATTR */
83
84static inline int
85ext4_xattr_get(struct inode *inode, int name_index, const char *name,
86 void *buffer, size_t size, int flags)
87{
88 return -EOPNOTSUPP;
89}
90
91static inline int
92ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
93{
94 return -EOPNOTSUPP;
95}
96
97static inline int
98ext4_xattr_set(struct inode *inode, int name_index, const char *name,
99 const void *value, size_t size, int flags)
100{
101 return -EOPNOTSUPP;
102}
103
104static inline int
105ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
106 const char *name, const void *value, size_t size, int flags)
107{
108 return -EOPNOTSUPP;
109}
110
111static inline void
112ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
113{
114}
115
116static inline void
117ext4_xattr_put_super(struct super_block *sb)
118{
119}
120
121static inline int
122init_ext4_xattr(void)
123{
124 return 0;
125}
126
127static inline void
128exit_ext4_xattr(void)
129{
130}
131
132#define ext4_xattr_handlers NULL
133
134# endif /* CONFIG_EXT4DEV_FS_XATTR */
135
136#ifdef CONFIG_EXT4DEV_FS_SECURITY
137extern int ext4_init_security(handle_t *handle, struct inode *inode,
138 struct inode *dir);
139#else
140static inline int ext4_init_security(handle_t *handle, struct inode *inode,
141 struct inode *dir)
142{
143 return 0;
144}
145#endif
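
[Editorial sketch, not part of the patch.] A quick arithmetic check of the rounding macros above; ENTRY_HDR assumes sizeof(struct ext4_xattr_entry) is 16 bytes, which matches the layout shown.

/* Illustrative userspace check of the xattr rounding macros. */
#include <stdio.h>

#define PAD		4
#define ROUND		(PAD - 1)
#define ENTRY_HDR	16	/* assumed sizeof(struct ext4_xattr_entry) */

#define XATTR_LEN(name_len)	(((name_len) + ROUND + ENTRY_HDR) & ~ROUND)
#define XATTR_SIZE(size)	(((size) + ROUND) & ~ROUND)

int main(void)
{
	/* name "foo" (3 bytes): 16 + 3 = 19, rounded up to 20 */
	printf("LEN(3)  = %d\n", XATTR_LEN(3));
	/* a 5-byte value occupies 8 bytes on disk */
	printf("SIZE(5) = %d\n", XATTR_SIZE(5));
	/* already-aligned sizes are unchanged */
	printf("SIZE(8) = %d\n", XATTR_SIZE(8));
	return 0;
}
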
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
1/*
2 * linux/fs/ext4/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext4_jbd2.h>
11#include <linux/ext4_fs.h>
12#include <linux/security.h>
13#include "xattr.h"
14
15static size_t
16ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
17 const char *name, size_t name_len)
18{
19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
20 const size_t total_len = prefix_len + name_len + 1;
21
22
23 if (list && total_len <= list_size) {
24 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
25 memcpy(list+prefix_len, name, name_len);
26 list[prefix_len + name_len] = '\0';
27 }
28 return total_len;
29}
30
31static int
32ext4_xattr_security_get(struct inode *inode, const char *name,
33 void *buffer, size_t size)
34{
35 if (strcmp(name, "") == 0)
36 return -EINVAL;
37 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
38 buffer, size);
39}
40
41static int
42ext4_xattr_security_set(struct inode *inode, const char *name,
43 const void *value, size_t size, int flags)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
48 value, size, flags);
49}
50
51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
53{
54 int err;
55 size_t len;
56 void *value;
57 char *name;
58
59 err = security_inode_init_security(inode, dir, &name, &value, &len);
60 if (err) {
61 if (err == -EOPNOTSUPP)
62 return 0;
63 return err;
64 }
65 err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
66 name, value, len, 0);
67 kfree(name);
68 kfree(value);
69 return err;
70}
71
72struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get,
76 .set = ext4_xattr_security_set,
77};
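
[Editorial sketch, not part of the patch.] The list() callback above follows the usual two-pass xattr convention: it always returns the number of bytes the composed entry name needs ("security." prefix + name + NUL) and copies it only when a sufficiently large buffer was supplied. A standalone userspace sketch of that convention (xattr_demo_list and its caller are invented names):

#include <stdio.h>
#include <string.h>

/* Return the bytes needed for "prefix" + name + NUL; copy only if they fit. */
static size_t xattr_demo_list(const char *prefix, const char *name,
			      char *list, size_t list_size)
{
	size_t prefix_len = strlen(prefix);
	size_t name_len = strlen(name);
	size_t total_len = prefix_len + name_len + 1;

	if (list && total_len <= list_size) {
		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len);
		list[prefix_len + name_len] = '\0';
	}
	return total_len;
}

int main(void)
{
	char buf[64];
	size_t need = xattr_demo_list("security.", "selinux", NULL, 0);	/* sizing pass */
	size_t used = xattr_demo_list("security.", "selinux", buf, sizeof(buf));

	printf("need=%zu used=%zu name=%s\n", need, used, buf);
	return 0;
}
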
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
1/*
2 * linux/fs/ext4/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/smp_lock.h>
13#include <linux/ext4_jbd2.h>
14#include <linux/ext4_fs.h>
15#include "xattr.h"
16
17#define XATTR_TRUSTED_PREFIX "trusted."
18
19static size_t
20ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
21 const char *name, size_t name_len)
22{
23 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
24 const size_t total_len = prefix_len + name_len + 1;
25
26 if (!capable(CAP_SYS_ADMIN))
27 return 0;
28
29 if (list && total_len <= list_size) {
30 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
31 memcpy(list+prefix_len, name, name_len);
32 list[prefix_len + name_len] = '\0';
33 }
34 return total_len;
35}
36
37static int
38ext4_xattr_trusted_get(struct inode *inode, const char *name,
39 void *buffer, size_t size)
40{
41 if (strcmp(name, "") == 0)
42 return -EINVAL;
43 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
44 buffer, size);
45}
46
47static int
48ext4_xattr_trusted_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
54 value, size, flags);
55}
56
57struct xattr_handler ext4_xattr_trusted_handler = {
58 .prefix = XATTR_TRUSTED_PREFIX,
59 .list = ext4_xattr_trusted_list,
60 .get = ext4_xattr_trusted_get,
61 .set = ext4_xattr_trusted_set,
62};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
1/*
2 * linux/fs/ext4/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext4_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP;
44 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
45}
46
47static int
48ext4_xattr_user_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 if (!test_opt(inode->i_sb, XATTR_USER))
54 return -EOPNOTSUPP;
55 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
56 value, size, flags);
57}
58
59struct xattr_handler ext4_xattr_user_handler = {
60 .prefix = XATTR_USER_PREFIX,
61 .list = ext4_xattr_user_list,
62 .get = ext4_xattr_user_get,
63 .set = ext4_xattr_user_set,
64};
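
[Editorial sketch, not part of the patch.] For context, the handler above is what ends up servicing the generic user.* xattr system calls once the filesystem is mounted with user_xattr enabled. A minimal userspace usage example using the standard <sys/xattr.h> interface; the path is only an assumption.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/test/file";	/* assumed to exist on an ext4 mount */
	char value[64];
	ssize_t len;

	/* Ends up in ext4_xattr_user_set() -> ext4_xattr_set(). */
	if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
		perror("setxattr");
		return 1;
	}

	/* Ends up in ext4_xattr_user_get() -> ext4_xattr_get(). */
	len = getxattr(path, "user.comment", value, sizeof(value));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	printf("user.comment = %.*s\n", (int)len, value);
	return 0;
}
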
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4613cb202170..78945b53b0f8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1472,7 +1472,7 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
1472 ret = writeback_inode(i1); 1472 ret = writeback_inode(i1);
1473 if (!ret && i2) 1473 if (!ret && i2)
1474 ret = writeback_inode(i2); 1474 ret = writeback_inode(i2);
1475 if (!ret && sb) { 1475 if (!ret) {
1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
1477 ret = filemap_flush(mapping); 1477 ret = filemap_flush(mapping);
1478 } 1478 }
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 1f94dd35a943..cdd1694e889b 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -45,7 +45,7 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
45 strncpy(buf, table_name, 256); 45 strncpy(buf, table_name, 256);
46 buf[255] = '\0'; 46 buf[255] = '\0';
47 47
48 p = strstr(buf, ":"); 48 p = strchr(buf, ':');
49 if (!p) { 49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name); 50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls); 51 kfree(ls);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5e03b2f67b93..4ee3f006b861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
293 if (h_vm_pgoff >= h_pgoff) 293 if (h_vm_pgoff >= h_pgoff)
294 v_offset = 0; 294 v_offset = 0;
295 295
296 unmap_hugepage_range(vma, 296 __unmap_hugepage_range(vma,
297 vma->vm_start + v_offset, vma->vm_end); 297 vma->vm_start + v_offset, vma->vm_end);
298 } 298 }
299} 299}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index c518dd8fe60a..b85c686b60db 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -725,6 +725,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
725 __FUNCTION__); 725 __FUNCTION__);
726 kfree(journal); 726 kfree(journal);
727 journal = NULL; 727 journal = NULL;
728 goto out;
728 } 729 }
729 journal->j_dev = bdev; 730 journal->j_dev = bdev;
730 journal->j_fs_dev = fs_dev; 731 journal->j_fs_dev = fs_dev;
@@ -735,7 +736,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
735 J_ASSERT(bh != NULL); 736 J_ASSERT(bh != NULL);
736 journal->j_sb_buffer = bh; 737 journal->j_sb_buffer = bh;
737 journal->j_superblock = (journal_superblock_t *)bh->b_data; 738 journal->j_superblock = (journal_superblock_t *)bh->b_data;
738 739out:
739 return journal; 740 return journal;
740} 741}
741 742
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD2) += jbd2.o
6
7jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
1/*
2 * linux/fs/jbd2/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction checkpoint list.
28 *
29 * Called with j_list_lock held.
30 */
31static inline void __buffer_unlink_first(struct journal_head *jh)
32{
33 transaction_t *transaction = jh->b_cp_transaction;
34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL;
41 }
42}
43
44/*
45 * Unlink a buffer from a transaction checkpoint(io) list.
46 *
47 * Called with j_list_lock held.
48 */
49static inline void __buffer_unlink(struct journal_head *jh)
50{
51 transaction_t *transaction = jh->b_cp_transaction;
52
53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL;
58 }
59}
60
61/*
62 * Move a buffer from the checkpoint list to the checkpoint io list
63 *
64 * Called with j_list_lock held
65 */
66static inline void __buffer_relink_io(struct journal_head *jh)
67{
68 transaction_t *transaction = jh->b_cp_transaction;
69
70 __buffer_unlink_first(jh);
71
72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh;
79 }
80 transaction->t_checkpoint_io_list = jh;
81}
82
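/*
 * Editorial aside (not part of this file): the checkpoint lists above are
 * circular doubly-linked lists threaded through b_cpnext/b_cpprev, with the
 * transaction's list pointer naming the most recently linked element.  A
 * stripped-down userspace model of the insertion done by __buffer_relink_io(),
 * using made-up names (demo_jh, demo_relink):
 */
struct demo_jh {
	struct demo_jh *cpnext, *cpprev;
};

/* Link jh in front of *list; an empty list becomes a one-element ring. */
static void demo_relink(struct demo_jh **list, struct demo_jh *jh)
{
	if (!*list) {
		jh->cpnext = jh->cpprev = jh;
	} else {
		jh->cpnext = *list;
		jh->cpprev = (*list)->cpprev;
		jh->cpprev->cpnext = jh;
		jh->cpnext->cpprev = jh;
	}
	*list = jh;
}
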
83/*
84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction.
87 *
88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */
91static int __try_to_free_cp_buf(struct journal_head *jh)
92{
93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh);
95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh);
100 jbd2_journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release");
102 __brelse(bh);
103 } else {
104 jbd_unlock_bh_state(bh);
105 }
106 return ret;
107}
108
109/*
110 * __jbd2_log_wait_for_space: wait until there is space in the journal.
111 *
112 * Called under j_state_lock *only*.  It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log.
114 */
115void __jbd2_log_wait_for_space(journal_t *journal)
116{
117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock);
119
120 nblocks = jbd_space_needed(journal);
121 while (__jbd2_log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JBD2_ABORT)
123 return;
124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex);
126
127 /*
128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock
130 */
131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock);
137 }
138 mutex_unlock(&journal->j_checkpoint_mutex);
139 }
140}
141
142/*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state().
146 */
147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock)
149{
150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh);
154 put_bh(bh);
155}
156
157/*
158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO.
162 *
163 * Called with j_list_lock held.
164 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{
167 struct journal_head *jh;
168 struct buffer_head *bh;
169 tid_t this_tid;
170 int released = 0;
171
172 this_tid = transaction->t_tid;
173restart:
174 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid)
177 return;
178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock);
184 goto restart;
185 }
186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh);
194 spin_lock(&journal->j_list_lock);
195 goto restart;
196 }
197 /*
198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list
200 */
201 released = __jbd2_journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh);
203 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh);
205 }
206}
207
208#define NR_BATCH 64
209
210static void
211__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212{
213 int i;
214
215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh);
221 }
222 *batch_count = 0;
223}
224
225/*
226 * Try to flush one buffer from the checkpoint list to disk.
227 *
228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list.
230 *
231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count)
236{
237 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0;
239
240 if (buffer_locked(bh)) {
241 atomic_inc(&bh->b_count);
242 spin_unlock(&journal->j_list_lock);
243 jbd_unlock_bh_state(bh);
244 wait_on_buffer(bh);
245 /* the journal_head may have gone by now */
246 BUFFER_TRACE(bh, "brelse");
247 __brelse(bh);
248 ret = 1;
249 } else if (jh->b_transaction != NULL) {
250 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid;
252
253 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid);
256 jbd2_log_wait_commit(journal, tid);
257 ret = 1;
258 } else if (!buffer_dirty(bh)) {
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint");
261 __jbd2_journal_remove_checkpoint(jh);
262 spin_unlock(&journal->j_list_lock);
263 jbd_unlock_bh_state(bh);
264 jbd2_journal_remove_journal_head(bh);
265 __brelse(bh);
266 ret = 1;
267 } else {
268 /*
269 * Important: we are about to write the buffer, and
270 * possibly block, while still holding the journal lock.
271 * We cannot afford to let the transaction logic start
272 * messing around with this buffer before we write it to
273 * disk, as that would break recoverability.
274 */
275 BUFFER_TRACE(bh, "queue");
276 get_bh(bh);
277 J_ASSERT_BH(bh, !buffer_jwrite(bh));
278 set_buffer_jwrite(bh);
279 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh);
282 (*batch_count)++;
283 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock);
285 __flush_batch(journal, bhs, batch_count);
286 ret = 1;
287 }
288 }
289 return ret;
290}
291
292/*
293 * Perform an actual checkpoint. We take the first transaction on the
294 * list of transactions to be checkpointed and send all its buffers
295 * to disk. We submit larger chunks of data at once.
296 *
297 * The journal should be locked before calling this function.
298 */
299int jbd2_log_do_checkpoint(journal_t *journal)
300{
301 transaction_t *transaction;
302 tid_t this_tid;
303 int result;
304
305 jbd_debug(1, "Start checkpoint\n");
306
307 /*
308 * First thing: if there are any transactions in the log which
309 * don't need checkpointing, just eliminate them from the
310 * journal straight away.
311 */
312 result = jbd2_cleanup_journal_tail(journal);
313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
314 if (result <= 0)
315 return result;
316
317 /*
318 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it.
320 */
321 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions)
323 goto out;
324 transaction = journal->j_checkpoint_transactions;
325 this_tid = transaction->t_tid;
326restart:
327 /*
328 * If someone cleaned up this transaction while we slept, we're
329 * done (maybe it's a new transaction, but it fell at the same
330 * address).
331 */
332 if (journal->j_checkpoint_transactions == transaction &&
333 transaction->t_tid == this_tid) {
334 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh;
337 int retry = 0;
338
339 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh;
341
342 jh = transaction->t_checkpoint_list;
343 bh = jh2bh(jh);
344 if (!jbd_trylock_bh_state(bh)) {
345 jbd_sync_bh(journal, bh);
346 retry = 1;
347 break;
348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){
351 spin_unlock(&journal->j_list_lock);
352 retry = 1;
353 break;
354 }
355 }
356
357 if (batch_count) {
358 if (!retry) {
359 spin_unlock(&journal->j_list_lock);
360 retry = 1;
361 }
362 __flush_batch(journal, bhs, &batch_count);
363 }
364
365 if (retry) {
366 spin_lock(&journal->j_list_lock);
367 goto restart;
368 }
369 /*
370 * Now we have cleaned up the first transaction's checkpoint
371 * list. Let's clean up the second one
372 */
373 __wait_cp_io(journal, transaction);
374 }
375out:
376 spin_unlock(&journal->j_list_lock);
377 result = jbd2_cleanup_journal_tail(journal);
378 if (result < 0)
379 return result;
380 return 0;
381}
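/*
 * A minimal sketch of how a caller that has run out of log space might
 * drive jbd2_log_do_checkpoint(), assuming it serializes on
 * j_checkpoint_mutex as the in-tree callers do; the loop itself is
 * illustrative only:
 *
 *	spin_lock(&journal->j_list_lock);
 *	while (journal->j_checkpoint_transactions != NULL) {
 *		spin_unlock(&journal->j_list_lock);
 *		mutex_lock(&journal->j_checkpoint_mutex);
 *		jbd2_log_do_checkpoint(journal);
 *		mutex_unlock(&journal->j_checkpoint_mutex);
 *		spin_lock(&journal->j_list_lock);
 *	}
 *	spin_unlock(&journal->j_list_lock);
 */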
382
383/*
384 * Check the list of checkpoint transactions for the journal to see if
385 * we have already got rid of any since the last update of the log tail
386 * in the journal superblock. If so, we can instantly roll the
387 * superblock forward to remove those transactions from the log.
388 *
389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
390 *
391 * Called with the journal lock held.
392 *
393 * This is the only part of the journaling code which really needs to be
394 * aware of transaction aborts. Checkpointing involves writing to the
395 * main filesystem area rather than to the journal, so it can proceed
396 * even in abort state, but we must not update the journal superblock if
397 * we have an abort error outstanding.
398 */
399
400int jbd2_cleanup_journal_tail(journal_t *journal)
401{
402 transaction_t * transaction;
403 tid_t first_tid;
404 unsigned long blocknr, freed;
405
406 /* OK, work out the oldest transaction remaining in the log, and
407 * the log block it starts at.
408 *
409 * If the log is now empty, we need to work out which is the
410 * next transaction ID we will write, and where it will
411 * start. */
412
413 spin_lock(&journal->j_state_lock);
414 spin_lock(&journal->j_list_lock);
415 transaction = journal->j_checkpoint_transactions;
416 if (transaction) {
417 first_tid = transaction->t_tid;
418 blocknr = transaction->t_log_start;
419 } else if ((transaction = journal->j_committing_transaction) != NULL) {
420 first_tid = transaction->t_tid;
421 blocknr = transaction->t_log_start;
422 } else if ((transaction = journal->j_running_transaction) != NULL) {
423 first_tid = transaction->t_tid;
424 blocknr = journal->j_head;
425 } else {
426 first_tid = journal->j_transaction_sequence;
427 blocknr = journal->j_head;
428 }
429 spin_unlock(&journal->j_list_lock);
430 J_ASSERT(blocknr != 0);
431
432 /* If the oldest pinned transaction is at the tail of the log
433 already then there's not much we can do right now. */
434 if (journal->j_tail_sequence == first_tid) {
435 spin_unlock(&journal->j_state_lock);
436 return 1;
437 }
438
439 /* OK, update the superblock to recover the freed space.
440 * Physical blocks come first: have we wrapped beyond the end of
441 * the log? */
442 freed = blocknr - journal->j_tail;
443 if (blocknr < journal->j_tail)
444 freed = freed + journal->j_last - journal->j_first;
445
446 jbd_debug(1,
447 "Cleaning journal tail from %d to %d (offset %lu), "
448 "freeing %lu\n",
449 journal->j_tail_sequence, first_tid, blocknr, freed);
450
451 journal->j_free += freed;
452 journal->j_tail_sequence = first_tid;
453 journal->j_tail = blocknr;
454 spin_unlock(&journal->j_state_lock);
455 if (!(journal->j_flags & JBD2_ABORT))
456 jbd2_journal_update_superblock(journal, 1);
457 return 0;
458}
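/*
 * A worked example of the wrap-around arithmetic above, using made-up
 * values: with j_first = 1, j_last = 8193 (an 8192-block log), an old
 * tail of j_tail = 8000 and a new tail of blocknr = 200, we have
 * blocknr < j_tail, so freed = (200 - 8000) + (8193 - 1) = 392. That is
 * the 193 blocks from the old tail to j_last plus the 199 blocks from
 * j_first to the new tail, i.e. the tail wrapped past the end of the log.
 */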
459
460
461/* Checkpoint list management */
462
463/*
464 * journal_clean_one_cp_list
465 *
466 * Find all the written-back checkpoint buffers in the given list and release them.
467 *
468 * Called with the journal locked.
469 * Called with j_list_lock held.
470 * Returns the number of buffers reaped (for debug)
471 */
472
473static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
474{
475 struct journal_head *last_jh;
476 struct journal_head *next_jh = jh;
477 int ret, freed = 0;
478
479 *released = 0;
480 if (!jh)
481 return 0;
482
483 last_jh = jh->b_cpprev;
484 do {
485 jh = next_jh;
486 next_jh = jh->b_cpnext;
487 /* Use trylock because of the ranking */
488 if (jbd_trylock_bh_state(jh2bh(jh))) {
489 ret = __try_to_free_cp_buf(jh);
490 if (ret) {
491 freed++;
492 if (ret == 2) {
493 *released = 1;
494 return freed;
495 }
496 }
497 }
498 /*
499 * This function only frees up some memory
500 * if possible so we don't have an obligation
501 * to finish processing. Bail out if preemption
502 * is requested:
503 */
504 if (need_resched())
505 return freed;
506 } while (jh != last_jh);
507
508 return freed;
509}
510
511/*
512 * journal_clean_checkpoint_list
513 *
514 * Find all the written-back checkpoint buffers in the journal and release them.
515 *
516 * Called with the journal locked.
517 * Called with j_list_lock held.
518 * Returns number of buffers reaped (for debug)
519 */
520
521int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
522{
523 transaction_t *transaction, *last_transaction, *next_transaction;
524 int ret = 0;
525 int released;
526
527 transaction = journal->j_checkpoint_transactions;
528 if (!transaction)
529 goto out;
530
531 last_transaction = transaction->t_cpprev;
532 next_transaction = transaction;
533 do {
534 transaction = next_transaction;
535 next_transaction = transaction->t_cpnext;
536 ret += journal_clean_one_cp_list(transaction->
537 t_checkpoint_list, &released);
538 /*
539 * This function only frees up some memory if possible so we
540 * don't have an obligation to finish processing. Bail out if
541 * preemption is requested:
542 */
543 if (need_resched())
544 goto out;
545 if (released)
546 continue;
547 /*
548 * It is essential that we are as careful here as in the
549 * t_checkpoint_list case when removing the buffer from the list,
550 * since io_list may hold buffers that have not yet been submitted
551 */
552 ret += journal_clean_one_cp_list(transaction->
553 t_checkpoint_io_list, &released);
554 if (need_resched())
555 goto out;
556 } while (transaction != last_transaction);
557out:
558 return ret;
559}
560
561/*
562 * journal_remove_checkpoint: called after a buffer has been committed
563 * to disk (either by being write-back flushed to disk, or being
564 * committed to the log).
565 *
566 * We cannot safely clean a transaction out of the log until all of the
567 * buffer updates committed in that transaction have safely been stored
568 * elsewhere on disk. To achieve this, all of the buffers in a
569 * transaction need to be maintained on the transaction's checkpoint
570 * lists until they have been rewritten, at which point this function is
571 * called to remove the buffer from the existing transaction's
572 * checkpoint lists.
573 *
574 * The function returns 1 if it frees the transaction, 0 otherwise.
575 *
576 * This function is called with the journal locked.
577 * This function is called with j_list_lock held.
578 * This function is called with jbd_lock_bh_state(jh2bh(jh))
579 */
580
581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
582{
583 transaction_t *transaction;
584 journal_t *journal;
585 int ret = 0;
586
587 JBUFFER_TRACE(jh, "entry");
588
589 if ((transaction = jh->b_cp_transaction) == NULL) {
590 JBUFFER_TRACE(jh, "not on transaction");
591 goto out;
592 }
593 journal = transaction->t_journal;
594
595 __buffer_unlink(jh);
596 jh->b_cp_transaction = NULL;
597
598 if (transaction->t_checkpoint_list != NULL ||
599 transaction->t_checkpoint_io_list != NULL)
600 goto out;
601 JBUFFER_TRACE(jh, "transaction has no more buffers");
602
603 /*
604 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the
606 * checkpoint list is empty, the transaction obviously cannot be
607 * dropped!
608 *
609 * The locking here around j_committing_transaction is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction().
611 */
612 if (transaction == journal->j_committing_transaction) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction");
614 goto out;
615 }
616
617 /* OK, that was the last buffer for the transaction: we can now
618 safely remove this transaction from the log */
619
620 __jbd2_journal_drop_transaction(journal, transaction);
621
622 /* Just in case anybody was waiting for more transactions to be
623 checkpointed... */
624 wake_up(&journal->j_wait_logspace);
625 ret = 1;
626out:
627 JBUFFER_TRACE(jh, "exit");
628 return ret;
629}
630
631/*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of
634 * the log.
635 *
636 * Called with the journal locked.
637 * Called with j_list_lock held.
638 */
639void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction)
641{
642 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645
646 jh->b_cp_transaction = transaction;
647
648 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh;
650 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh;
655 }
656 transaction->t_checkpoint_list = jh;
657}
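/*
 * The checkpoint list built above is a circular, doubly linked list whose
 * head, t_checkpoint_list, always points at the most recently inserted
 * buffer. With two hypothetical buffers A then B: after inserting A the
 * list is just A linked to itself; after inserting B the head becomes B,
 * with B->b_cpnext == A and B->b_cpprev == A, and A pointing back at B,
 * so walkers following b_cpnext stop once they come back around.
 */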
658
659/*
660 * We've finished with this transaction structure: adios...
661 *
662 * The transaction must have no links except for the checkpoint by this
663 * point.
664 *
665 * Called with the journal locked.
666 * Called with j_list_lock held.
667 */
668
669void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
670{
671 assert_spin_locked(&journal->j_list_lock);
672 if (transaction->t_cpnext) {
673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
675 if (journal->j_checkpoint_transactions == transaction)
676 journal->j_checkpoint_transactions =
677 transaction->t_cpnext;
678 if (journal->j_checkpoint_transactions == transaction)
679 journal->j_checkpoint_transactions = NULL;
680 }
681
682 J_ASSERT(transaction->t_state == T_FINISHED);
683 J_ASSERT(transaction->t_buffers == NULL);
684 J_ASSERT(transaction->t_sync_datalist == NULL);
685 J_ASSERT(transaction->t_forget == NULL);
686 J_ASSERT(transaction->t_iobuf_list == NULL);
687 J_ASSERT(transaction->t_shadow_list == NULL);
688 J_ASSERT(transaction->t_log_list == NULL);
689 J_ASSERT(transaction->t_checkpoint_list == NULL);
690 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
691 J_ASSERT(transaction->t_updates == 0);
692 J_ASSERT(journal->j_committing_transaction != transaction);
693 J_ASSERT(journal->j_running_transaction != transaction);
694
695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
696 kfree(transaction);
697}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
1/*
2 * linux/fs/jbd2/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd2.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
25/*
26 * Default IO end handler for temporary BJ_IO buffer_heads.
27 */
28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
29{
30 BUFFER_TRACE(bh, "");
31 if (uptodate)
32 set_buffer_uptodate(bh);
33 else
34 clear_buffer_uptodate(bh);
35 unlock_buffer(bh);
36}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not successfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
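/*
 * journal_submit_data_buffers() below, for instance, falls back to taking
 * both locks in blocking fashion when the trylock loses:
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */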
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = jbd2_journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* AKPM: buglet - add `i' to tmp! */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t*)bh->b_data;
123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JBD2_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JBD2_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 jbd2_journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get a reference just to make sure the buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __jbd2_journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
254 __jbd2_journal_unfile_buffer(jh);
255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
258 jbd2_journal_remove_journal_head(bh);
259 /* Once for our safety reference, once for
260 * jbd2_journal_remove_journal_head() */
261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
274static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
275 unsigned long long block)
276{
277 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
278 if (tag_bytes > JBD_TAG_SIZE32)
279 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
280}
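/*
 * write_tag_block() splits a 64-bit block number across the on-disk tag.
 * For a hypothetical block of 0x123456789ULL, t_blocknr receives the low
 * 32 bits (0x23456789) and, when tag_bytes > JBD_TAG_SIZE32,
 * t_blocknr_high receives (block >> 31) >> 1, i.e. block >> 32, which is
 * 0x1. The shift is split in two, presumably so the expression stays
 * well-defined if the block type is ever only 32 bits wide.
 */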
281
282/*
283 * jbd2_journal_commit_transaction
284 *
285 * The primary function for committing a transaction to the log. This
286 * function is called by the journal thread to begin a complete commit.
287 */
288void jbd2_journal_commit_transaction(journal_t *journal)
289{
290 transaction_t *commit_transaction;
291 struct journal_head *jh, *new_jh, *descriptor;
292 struct buffer_head **wbuf = journal->j_wbuf;
293 int bufs;
294 int flags;
295 int err;
296 unsigned long long blocknr;
297 char *tagp = NULL;
298 journal_header_t *header;
299 journal_block_tag_t *tag = NULL;
300 int space_left = 0;
301 int first_tag = 0;
302 int tag_flag;
303 int i;
304 int tag_bytes = journal_tag_bytes(journal);
305
306 /*
307 * First job: lock down the current transaction and wait for
308 * all outstanding updates to complete.
309 */
310
311#ifdef COMMIT_STATS
312 spin_lock(&journal->j_list_lock);
313 summarise_journal_usage(journal);
314 spin_unlock(&journal->j_list_lock);
315#endif
316
317 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
318 if (journal->j_flags & JBD2_FLUSHED) {
319 jbd_debug(3, "super block updated\n");
320 jbd2_journal_update_superblock(journal, 1);
321 } else {
322 jbd_debug(3, "superblock not updated\n");
323 }
324
325 J_ASSERT(journal->j_running_transaction != NULL);
326 J_ASSERT(journal->j_committing_transaction == NULL);
327
328 commit_transaction = journal->j_running_transaction;
329 J_ASSERT(commit_transaction->t_state == T_RUNNING);
330
331 jbd_debug(1, "JBD: starting commit of transaction %d\n",
332 commit_transaction->t_tid);
333
334 spin_lock(&journal->j_state_lock);
335 commit_transaction->t_state = T_LOCKED;
336
337 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait);
340
341 prepare_to_wait(&journal->j_wait_updates, &wait,
342 TASK_UNINTERRUPTIBLE);
343 if (commit_transaction->t_updates) {
344 spin_unlock(&commit_transaction->t_handle_lock);
345 spin_unlock(&journal->j_state_lock);
346 schedule();
347 spin_lock(&journal->j_state_lock);
348 spin_lock(&commit_transaction->t_handle_lock);
349 }
350 finish_wait(&journal->j_wait_updates, &wait);
351 }
352 spin_unlock(&commit_transaction->t_handle_lock);
353
354 J_ASSERT (commit_transaction->t_outstanding_credits <=
355 journal->j_max_transaction_buffers);
356
357 /*
358 * First thing we are allowed to do is to discard any remaining
359 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
360 * that there are no such buffers: if a large filesystem
361 * operation like a truncate needs to split itself over multiple
362 * transactions, then it may try to do a jbd2_journal_restart() while
363 * there are still BJ_Reserved buffers outstanding. These must
364 * be released cleanly from the current transaction.
365 *
366 * In this case, the filesystem must still reserve write access
367 * again before modifying the buffer in the new transaction, but
368 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour
370 * that multiple jbd2_journal_get_write_access() calls to the same
371 * buffer are perfectly permissible.
372 */
373 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list;
375 JBUFFER_TRACE(jh, "reserved, unused: refile");
376 /*
377 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
378 * leave undo-committed data.
379 */
380 if (jh->b_committed_data) {
381 struct buffer_head *bh = jh2bh(jh);
382
383 jbd_lock_bh_state(bh);
384 jbd2_slab_free(jh->b_committed_data, bh->b_size);
385 jh->b_committed_data = NULL;
386 jbd_unlock_bh_state(bh);
387 }
388 jbd2_journal_refile_buffer(journal, jh);
389 }
390
391 /*
392 * Now try to drop any written-back buffers from the journal's
393 * checkpoint lists. We do this *before* commit because it potentially
394 * frees some memory
395 */
396 spin_lock(&journal->j_list_lock);
397 __jbd2_journal_clean_checkpoint_list(journal);
398 spin_unlock(&journal->j_list_lock);
399
400 jbd_debug (3, "JBD: commit phase 1\n");
401
402 /*
403 * Switch to a new revoke table.
404 */
405 jbd2_journal_switch_revoke_table(journal);
406
407 commit_transaction->t_state = T_FLUSH;
408 journal->j_committing_transaction = commit_transaction;
409 journal->j_running_transaction = NULL;
410 commit_transaction->t_log_start = journal->j_head;
411 wake_up(&journal->j_wait_transaction_locked);
412 spin_unlock(&journal->j_state_lock);
413
414 jbd_debug (3, "JBD: commit phase 2\n");
415
416 /*
417 * First, drop the modified flag: all accesses to the buffers
418 * will be tracked for a new transaction only -bzzz
419 */
420 spin_lock(&journal->j_list_lock);
421 if (commit_transaction->t_buffers) {
422 new_jh = jh = commit_transaction->t_buffers->b_tnext;
423 do {
424 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
425 new_jh->b_modified == 0);
426 new_jh->b_modified = 0;
427 new_jh = new_jh->b_tnext;
428 } while (new_jh != jh);
429 }
430 spin_unlock(&journal->j_list_lock);
431
432 /*
433 * Now start flushing things to disk, in the order they appear
434 * on the transaction lists. Data blocks go first.
435 */
436 err = 0;
437 journal_submit_data_buffers(journal, commit_transaction);
438
439 /*
440 * Wait for all previously submitted IO to complete.
441 */
442 spin_lock(&journal->j_list_lock);
443 while (commit_transaction->t_locked_list) {
444 struct buffer_head *bh;
445
446 jh = commit_transaction->t_locked_list->b_tprev;
447 bh = jh2bh(jh);
448 get_bh(bh);
449 if (buffer_locked(bh)) {
450 spin_unlock(&journal->j_list_lock);
451 wait_on_buffer(bh);
452 if (unlikely(!buffer_uptodate(bh)))
453 err = -EIO;
454 spin_lock(&journal->j_list_lock);
455 }
456 if (!inverted_lock(journal, bh)) {
457 put_bh(bh);
458 spin_lock(&journal->j_list_lock);
459 continue;
460 }
461 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
462 __jbd2_journal_unfile_buffer(jh);
463 jbd_unlock_bh_state(bh);
464 jbd2_journal_remove_journal_head(bh);
465 put_bh(bh);
466 } else {
467 jbd_unlock_bh_state(bh);
468 }
469 put_bh(bh);
470 cond_resched_lock(&journal->j_list_lock);
471 }
472 spin_unlock(&journal->j_list_lock);
473
474 if (err)
475 __jbd2_journal_abort_hard(journal);
476
477 jbd2_journal_write_revoke_records(journal, commit_transaction);
478
479 jbd_debug(3, "JBD: commit phase 2\n");
480
481 /*
482 * If we found any dirty or locked buffers, then we should have
483 * looped back up to the write_out_data label. If there weren't
484 * any then journal_clean_data_list should have wiped the list
485 * clean by now, so check that it is in fact empty.
486 */
487 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
488
489 jbd_debug (3, "JBD: commit phase 3\n");
490
491 /*
492 * Way to go: we have now written out all of the data for a
493 * transaction! Now comes the tricky part: we need to write out
494 * metadata. Loop over the transaction's entire buffer list:
495 */
496 commit_transaction->t_state = T_COMMIT;
497
498 descriptor = NULL;
499 bufs = 0;
500 while (commit_transaction->t_buffers) {
501
502 /* Find the next buffer to be journaled... */
503
504 jh = commit_transaction->t_buffers;
505
506 /* If we're in abort mode, we just un-journal the buffer and
507 release it for background writing. */
508
509 if (is_journal_aborted(journal)) {
510 JBUFFER_TRACE(jh, "journal is aborting: refile");
511 jbd2_journal_refile_buffer(journal, jh);
512 /* If that was the last one, we need to clean up
513 * any descriptor buffers which may have been
514 * already allocated, even if we are now
515 * aborting. */
516 if (!commit_transaction->t_buffers)
517 goto start_journal_io;
518 continue;
519 }
520
521 /* Make sure we have a descriptor block in which to
522 record the metadata buffer. */
523
524 if (!descriptor) {
525 struct buffer_head *bh;
526
527 J_ASSERT (bufs == 0);
528
529 jbd_debug(4, "JBD: get descriptor\n");
530
531 descriptor = jbd2_journal_get_descriptor_buffer(journal);
532 if (!descriptor) {
533 __jbd2_journal_abort_hard(journal);
534 continue;
535 }
536
537 bh = jh2bh(descriptor);
538 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
539 (unsigned long long)bh->b_blocknr, bh->b_data);
540 header = (journal_header_t *)&bh->b_data[0];
541 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
542 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
543 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
544
545 tagp = &bh->b_data[sizeof(journal_header_t)];
546 space_left = bh->b_size - sizeof(journal_header_t);
547 first_tag = 1;
548 set_buffer_jwrite(bh);
549 set_buffer_dirty(bh);
550 wbuf[bufs++] = bh;
551
552 /* Record it so that we can wait for IO
553 completion later */
554 BUFFER_TRACE(bh, "ph3: file as descriptor");
555 jbd2_journal_file_buffer(descriptor, commit_transaction,
556 BJ_LogCtl);
557 }
558
559 /* Where is the buffer to be written? */
560
561 err = jbd2_journal_next_log_block(journal, &blocknr);
562 /* If the block mapping failed, just abandon the buffer
563 and repeat this loop: we'll fall into the
564 refile-on-abort condition above. */
565 if (err) {
566 __jbd2_journal_abort_hard(journal);
567 continue;
568 }
569
570 /*
571 * start_this_handle() uses t_outstanding_credits to determine
572 * the free space in the log, but this counter is changed
573 * by jbd2_journal_next_log_block() also.
574 */
575 commit_transaction->t_outstanding_credits--;
576
577 /* Bump b_count to prevent truncate from stumbling over
578 the shadowed buffer! @@@ This can go if we ever get
579 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
580 atomic_inc(&jh2bh(jh)->b_count);
581
582 /* Make a temporary IO buffer with which to write it out
583 (this will requeue both the metadata buffer and the
584 temporary IO buffer). new_bh goes on BJ_IO */
585
586 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
587 /*
588 * akpm: jbd2_journal_write_metadata_buffer() sets
589 * new_bh->b_transaction to commit_transaction.
590 * We need to clean this up before we release new_bh
591 * (which is of type BJ_IO)
592 */
593 JBUFFER_TRACE(jh, "ph3: write metadata");
594 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
595 jh, &new_jh, blocknr);
596 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
597 wbuf[bufs++] = jh2bh(new_jh);
598
599 /* Record the new block's tag in the current descriptor
600 buffer */
601
602 tag_flag = 0;
603 if (flags & 1)
604 tag_flag |= JBD2_FLAG_ESCAPE;
605 if (!first_tag)
606 tag_flag |= JBD2_FLAG_SAME_UUID;
607
608 tag = (journal_block_tag_t *) tagp;
609 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
610 tag->t_flags = cpu_to_be32(tag_flag);
611 tagp += tag_bytes;
612 space_left -= tag_bytes;
613
614 if (first_tag) {
615 memcpy (tagp, journal->j_uuid, 16);
616 tagp += 16;
617 space_left -= 16;
618 first_tag = 0;
619 }
620
621 /* If there's no more to do, or if the descriptor is full,
622 let the IO rip! */
623
624 if (bufs == journal->j_wbufsize ||
625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) {
627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
629
630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to
632 the last tag we set up. */
633
634 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
635
636start_journal_io:
637 for (i = 0; i < bufs; i++) {
638 struct buffer_head *bh = wbuf[i];
639 lock_buffer(bh);
640 clear_buffer_dirty(bh);
641 set_buffer_uptodate(bh);
642 bh->b_end_io = journal_end_buffer_io_sync;
643 submit_bh(WRITE, bh);
644 }
645 cond_resched();
646
647 /* Force a new descriptor to be generated next
648 time round the loop. */
649 descriptor = NULL;
650 bufs = 0;
651 }
652 }
653
654 /* Lo and behold: we have just managed to send a transaction to
655 the log. Before we can commit it, wait for the IO so far to
656 complete. Control buffers being written are on the
657 transaction's t_log_list queue, and metadata buffers are on
658 the t_iobuf_list queue.
659
660 Wait for the buffers in reverse order. That way we are
661 less likely to be woken up until all IOs have completed, and
662 so we incur less scheduling load.
663 */
664
665 jbd_debug(3, "JBD: commit phase 4\n");
666
667 /*
668 * akpm: these are BJ_IO, and j_list_lock is not needed.
669 * See __journal_try_to_free_buffer.
670 */
671wait_for_iobuf:
672 while (commit_transaction->t_iobuf_list != NULL) {
673 struct buffer_head *bh;
674
675 jh = commit_transaction->t_iobuf_list->b_tprev;
676 bh = jh2bh(jh);
677 if (buffer_locked(bh)) {
678 wait_on_buffer(bh);
679 goto wait_for_iobuf;
680 }
681 if (cond_resched())
682 goto wait_for_iobuf;
683
684 if (unlikely(!buffer_uptodate(bh)))
685 err = -EIO;
686
687 clear_buffer_jwrite(bh);
688
689 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
690 jbd2_journal_unfile_buffer(journal, jh);
691
692 /*
693 * ->t_iobuf_list should contain only dummy buffer_heads
694 * which were created by jbd2_journal_write_metadata_buffer().
695 */
696 BUFFER_TRACE(bh, "dumping temporary bh");
697 jbd2_journal_put_journal_head(jh);
698 __brelse(bh);
699 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
700 free_buffer_head(bh);
701
702 /* We also have to unlock and free the corresponding
703 shadowed buffer */
704 jh = commit_transaction->t_shadow_list->b_tprev;
705 bh = jh2bh(jh);
706 clear_bit(BH_JWrite, &bh->b_state);
707 J_ASSERT_BH(bh, buffer_jbddirty(bh));
708
709 /* The metadata is now released for reuse, but we need
710 to remember it against this transaction so that when
711 we finally commit, we can do any checkpointing
712 required. */
713 JBUFFER_TRACE(jh, "file as BJ_Forget");
714 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
715 /* Wake up any transactions which were waiting for this
716 IO to complete */
717 wake_up_bit(&bh->b_state, BH_Unshadow);
718 JBUFFER_TRACE(jh, "brelse shadowed buffer");
719 __brelse(bh);
720 }
721
722 J_ASSERT (commit_transaction->t_shadow_list == NULL);
723
724 jbd_debug(3, "JBD: commit phase 5\n");
725
726 /* Here we wait for the revoke record and descriptor record buffers */
727 wait_for_ctlbuf:
728 while (commit_transaction->t_log_list != NULL) {
729 struct buffer_head *bh;
730
731 jh = commit_transaction->t_log_list->b_tprev;
732 bh = jh2bh(jh);
733 if (buffer_locked(bh)) {
734 wait_on_buffer(bh);
735 goto wait_for_ctlbuf;
736 }
737 if (cond_resched())
738 goto wait_for_ctlbuf;
739
740 if (unlikely(!buffer_uptodate(bh)))
741 err = -EIO;
742
743 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
744 clear_buffer_jwrite(bh);
745 jbd2_journal_unfile_buffer(journal, jh);
746 jbd2_journal_put_journal_head(jh);
747 __brelse(bh); /* One for getblk */
748 /* AKPM: bforget here */
749 }
750
751 jbd_debug(3, "JBD: commit phase 6\n");
752
753 if (journal_write_commit_record(journal, commit_transaction))
754 err = -EIO;
755
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758
759 /* End of a transaction! Finally, we can do checkpoint
760 processing: any buffers committed as a result of this
761 transaction can be removed from any checkpoint list it was on
762 before. */
763
764 jbd_debug(3, "JBD: commit phase 7\n");
765
766 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
767 J_ASSERT(commit_transaction->t_buffers == NULL);
768 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
769 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
770 J_ASSERT(commit_transaction->t_shadow_list == NULL);
771 J_ASSERT(commit_transaction->t_log_list == NULL);
772
773restart_loop:
774 /*
775 * As there are other places (journal_unmap_buffer()) adding buffers
776 * to this list we have to be careful and hold the j_list_lock.
777 */
778 spin_lock(&journal->j_list_lock);
779 while (commit_transaction->t_forget) {
780 transaction_t *cp_transaction;
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_forget;
784 spin_unlock(&journal->j_list_lock);
785 bh = jh2bh(jh);
786 jbd_lock_bh_state(bh);
787 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
788 jh->b_transaction == journal->j_running_transaction);
789
790 /*
791 * If there is undo-protected committed data against
792 * this buffer, then we can remove it now. If it is a
793 * buffer needing such protection, the old frozen_data
794 * field now points to a committed version of the
795 * buffer, so rotate that field to the new committed
796 * data.
797 *
798 * Otherwise, we can just throw away the frozen data now.
799 */
800 if (jh->b_committed_data) {
801 jbd2_slab_free(jh->b_committed_data, bh->b_size);
802 jh->b_committed_data = NULL;
803 if (jh->b_frozen_data) {
804 jh->b_committed_data = jh->b_frozen_data;
805 jh->b_frozen_data = NULL;
806 }
807 } else if (jh->b_frozen_data) {
808 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
809 jh->b_frozen_data = NULL;
810 }
811
812 spin_lock(&journal->j_list_lock);
813 cp_transaction = jh->b_cp_transaction;
814 if (cp_transaction) {
815 JBUFFER_TRACE(jh, "remove from old cp transaction");
816 __jbd2_journal_remove_checkpoint(jh);
817 }
818
819 /* Only re-checkpoint the buffer_head if it is marked
820 * dirty. If the buffer was added to the BJ_Forget list
821 * by jbd2_journal_forget, it may no longer be dirty and
822 * there's no point in keeping a checkpoint record for
823 * it. */
824
825 /* A buffer which has been freed while still being
826 * journaled by a previous transaction may end up still
827 * being dirty here, but we want to avoid writing back
828 * that buffer in the future now that the last use has
829 * been committed. That's not only a performance gain,
830 * it also stops aliasing problems if the buffer is left
831 * behind for writeback and gets reallocated for another
832 * use in a different page. */
833 if (buffer_freed(bh)) {
834 clear_buffer_freed(bh);
835 clear_buffer_jbddirty(bh);
836 }
837
838 if (buffer_jbddirty(bh)) {
839 JBUFFER_TRACE(jh, "add to new checkpointing trans");
840 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
841 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
842 __jbd2_journal_refile_buffer(jh);
843 jbd_unlock_bh_state(bh);
844 } else {
845 J_ASSERT_BH(bh, !buffer_dirty(bh));
846 /* The buffer on BJ_Forget list and not jbddirty means
847 * it has been freed by this transaction and hence it
848 * could not have been reallocated until this
849 * transaction has committed. *BUT* it could be
850 * reallocated once we have written all the data to
851 * disk and before we process the buffer on BJ_Forget
852 * list. */
853 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
854 __jbd2_journal_refile_buffer(jh);
855 if (!jh->b_transaction) {
856 jbd_unlock_bh_state(bh);
857 /* needs a brelse */
858 jbd2_journal_remove_journal_head(bh);
859 release_buffer_page(bh);
860 } else
861 jbd_unlock_bh_state(bh);
862 }
863 cond_resched_lock(&journal->j_list_lock);
864 }
865 spin_unlock(&journal->j_list_lock);
866 /*
867 * This is a bit sleazy. We borrow j_list_lock to protect
868 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
869 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
870 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint
871 */
872 spin_lock(&journal->j_state_lock);
873 spin_lock(&journal->j_list_lock);
874 /*
875 * Now recheck if some buffers did not get attached to the transaction
876 * while the lock was dropped...
877 */
878 if (commit_transaction->t_forget) {
879 spin_unlock(&journal->j_list_lock);
880 spin_unlock(&journal->j_state_lock);
881 goto restart_loop;
882 }
883
884 /* Done with this transaction! */
885
886 jbd_debug(3, "JBD: commit phase 8\n");
887
888 J_ASSERT(commit_transaction->t_state == T_COMMIT);
889
890 commit_transaction->t_state = T_FINISHED;
891 J_ASSERT(commit_transaction == journal->j_committing_transaction);
892 journal->j_commit_sequence = commit_transaction->t_tid;
893 journal->j_committing_transaction = NULL;
894 spin_unlock(&journal->j_state_lock);
895
896 if (commit_transaction->t_checkpoint_list == NULL) {
897 __jbd2_journal_drop_transaction(journal, commit_transaction);
898 } else {
899 if (journal->j_checkpoint_transactions == NULL) {
900 journal->j_checkpoint_transactions = commit_transaction;
901 commit_transaction->t_cpnext = commit_transaction;
902 commit_transaction->t_cpprev = commit_transaction;
903 } else {
904 commit_transaction->t_cpnext =
905 journal->j_checkpoint_transactions;
906 commit_transaction->t_cpprev =
907 commit_transaction->t_cpnext->t_cpprev;
908 commit_transaction->t_cpnext->t_cpprev =
909 commit_transaction;
910 commit_transaction->t_cpprev->t_cpnext =
911 commit_transaction;
912 }
913 }
914 spin_unlock(&journal->j_list_lock);
915
916 jbd_debug(1, "JBD: commit %d complete, head %d\n",
917 journal->j_commit_sequence, journal->j_tail_sequence);
918
919 wake_up(&journal->j_wait_done_commit);
920}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..10db92ced014
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2083 @@
1/*
2 * linux/fs/jbd2/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd2.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <linux/kthread.h>
37#include <linux/poison.h>
38#include <linux/proc_fs.h>
39
40#include <asm/uaccess.h>
41#include <asm/page.h>
42
43EXPORT_SYMBOL(jbd2_journal_start);
44EXPORT_SYMBOL(jbd2_journal_restart);
45EXPORT_SYMBOL(jbd2_journal_extend);
46EXPORT_SYMBOL(jbd2_journal_stop);
47EXPORT_SYMBOL(jbd2_journal_lock_updates);
48EXPORT_SYMBOL(jbd2_journal_unlock_updates);
49EXPORT_SYMBOL(jbd2_journal_get_write_access);
50EXPORT_SYMBOL(jbd2_journal_get_create_access);
51EXPORT_SYMBOL(jbd2_journal_get_undo_access);
52EXPORT_SYMBOL(jbd2_journal_dirty_data);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget);
56#if 0
57EXPORT_SYMBOL(journal_sync_buffer);
58#endif
59EXPORT_SYMBOL(jbd2_journal_flush);
60EXPORT_SYMBOL(jbd2_journal_revoke);
61
62EXPORT_SYMBOL(jbd2_journal_init_dev);
63EXPORT_SYMBOL(jbd2_journal_init_inode);
64EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_update_superblock);
72EXPORT_SYMBOL(jbd2_journal_abort);
73EXPORT_SYMBOL(jbd2_journal_errno);
74EXPORT_SYMBOL(jbd2_journal_ack_err);
75EXPORT_SYMBOL(jbd2_journal_clear_err);
76EXPORT_SYMBOL(jbd2_log_wait_commit);
77EXPORT_SYMBOL(jbd2_journal_start_commit);
78EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
79EXPORT_SYMBOL(jbd2_journal_wipe);
80EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
83EXPORT_SYMBOL(jbd2_journal_force_commit);
84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno);
87static int jbd2_journal_create_jbd_slab(size_t slab_size);
88
89/*
90 * Helper function used to manage commit timeouts
91 */
92
93static void commit_timeout(unsigned long __data)
94{
95 struct task_struct * p = (struct task_struct *) __data;
96
97 wake_up_process(p);
98}
99
100/*
101 * kjournald2: The main thread function used to manage a logging device
102 * journal.
103 *
104 * This kernel thread is responsible for two things:
105 *
106 * 1) COMMIT: Every so often we need to commit the current state of the
107 * filesystem to disk. The journal thread is responsible for writing
108 * all of the metadata buffers to disk.
109 *
110 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
111 * of the data in that part of the log has been rewritten elsewhere on
112 * the disk. Flushing these old buffers to reclaim space in the log is
113 * known as checkpointing, and this thread is responsible for that job.
114 */
115
116static int kjournald2(void *arg)
117{
118 journal_t *journal = arg;
119 transaction_t *transaction;
120
121 /*
122 * Set up an interval timer which can be used to trigger a commit wakeup
123 * after the commit interval expires
124 */
125 setup_timer(&journal->j_commit_timer, commit_timeout,
126 (unsigned long)current);
127
128 /* Record that the journal thread is running */
129 journal->j_task = current;
130 wake_up(&journal->j_wait_done_commit);
131
132 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
133 journal->j_commit_interval / HZ);
134
135 /*
136 * And now, wait forever for commit wakeup events.
137 */
138 spin_lock(&journal->j_state_lock);
139
140loop:
141 if (journal->j_flags & JBD2_UNMOUNT)
142 goto end_loop;
143
144 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
145 journal->j_commit_sequence, journal->j_commit_request);
146
147 if (journal->j_commit_sequence != journal->j_commit_request) {
148 jbd_debug(1, "OK, requests differ\n");
149 spin_unlock(&journal->j_state_lock);
150 del_timer_sync(&journal->j_commit_timer);
151 jbd2_journal_commit_transaction(journal);
152 spin_lock(&journal->j_state_lock);
153 goto loop;
154 }
155
156 wake_up(&journal->j_wait_done_commit);
157 if (freezing(current)) {
158 /*
159 * The simpler the better. Flushing the journal isn't a
160 * good idea, because that depends on threads that may
161 * already be stopped.
162 */
163 jbd_debug(1, "Now suspending kjournald2\n");
164 spin_unlock(&journal->j_state_lock);
165 refrigerator();
166 spin_lock(&journal->j_state_lock);
167 } else {
168 /*
169 * We assume on resume that commits are already there,
170 * so we don't sleep
171 */
172 DEFINE_WAIT(wait);
173 int should_sleep = 1;
174
175 prepare_to_wait(&journal->j_wait_commit, &wait,
176 TASK_INTERRUPTIBLE);
177 if (journal->j_commit_sequence != journal->j_commit_request)
178 should_sleep = 0;
179 transaction = journal->j_running_transaction;
180 if (transaction && time_after_eq(jiffies,
181 transaction->t_expires))
182 should_sleep = 0;
183 if (journal->j_flags & JBD2_UNMOUNT)
184 should_sleep = 0;
185 if (should_sleep) {
186 spin_unlock(&journal->j_state_lock);
187 schedule();
188 spin_lock(&journal->j_state_lock);
189 }
190 finish_wait(&journal->j_wait_commit, &wait);
191 }
192
193 jbd_debug(1, "kjournald2 wakes\n");
194
195 /*
196 * Were we woken up by a commit wakeup event?
197 */
198 transaction = journal->j_running_transaction;
199 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
200 journal->j_commit_request = transaction->t_tid;
201 jbd_debug(1, "woke because of timeout\n");
202 }
203 goto loop;
204
205end_loop:
206 spin_unlock(&journal->j_state_lock);
207 del_timer_sync(&journal->j_commit_timer);
208 journal->j_task = NULL;
209 wake_up(&journal->j_wait_done_commit);
210 jbd_debug(1, "Journal thread exiting.\n");
211 return 0;
212}
213
214static void jbd2_journal_start_thread(journal_t *journal)
215{
216 kthread_run(kjournald2, journal, "kjournald2");
217 wait_event(journal->j_wait_done_commit, journal->j_task != 0);
218}
219
220static void journal_kill_thread(journal_t *journal)
221{
222 spin_lock(&journal->j_state_lock);
223 journal->j_flags |= JBD2_UNMOUNT;
224
225 while (journal->j_task) {
226 wake_up(&journal->j_wait_commit);
227 spin_unlock(&journal->j_state_lock);
228 wait_event(journal->j_wait_done_commit, journal->j_task == 0);
229 spin_lock(&journal->j_state_lock);
230 }
231 spin_unlock(&journal->j_state_lock);
232}
233
234/*
235 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
236 *
237 * Writes a metadata buffer to a given disk block. The actual IO is not
238 * performed but a new buffer_head is constructed which labels the data
239 * to be written with the correct destination disk block.
240 *
241 * Any magic-number escaping which needs to be done will cause a
242 * copy-out here. If the buffer happens to start with the
243 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
244 * magic number is only written to the log for descriptor blocks. In
245 * this case, we copy the data and replace the first word with 0, and we
246 * return a result code which indicates that this buffer needs to be
247 * marked as an escaped buffer in the corresponding log descriptor
248 * block. The missing word can then be restored when the block is read
249 * during recovery.
250 *
251 * If the source buffer has already been modified by a new transaction
252 * since we took the last commit snapshot, we use the frozen copy of
253 * that data for IO. If we end up using the existing buffer_head's data
254 * for the write, then we *have* to lock the buffer to prevent anyone
255 * else from using and possibly modifying it while the IO is in
256 * progress.
257 *
258 * The function returns a pointer to the buffer_heads to be used for IO.
259 *
260 * We assume that the journal has already been locked in this function.
261 *
262 * Return value:
263 * <0: Error
264 * >=0: Finished OK
265 *
266 * On success:
267 * Bit 0 set == escape performed on the data
268 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
269 */
270
271int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
272 struct journal_head *jh_in,
273 struct journal_head **jh_out,
274 unsigned long long blocknr)
275{
276 int need_copy_out = 0;
277 int done_copy_out = 0;
278 int do_escape = 0;
279 char *mapped_data;
280 struct buffer_head *new_bh;
281 struct journal_head *new_jh;
282 struct page *new_page;
283 unsigned int new_offset;
284 struct buffer_head *bh_in = jh2bh(jh_in);
285
286 /*
287 * The buffer really shouldn't be locked: only the current committing
288 * transaction is allowed to write it, so nobody else is allowed
289 * to do any IO.
290 *
291 * akpm: except if we're journalling data, and write() output is
292 * also part of a shared mapping, and another thread has
293 * decided to launch a writepage() against this buffer.
294 */
295 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
296
297 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
298
299 /*
300 * If a new transaction has already done a buffer copy-out, then
301 * we use that version of the data for the commit.
302 */
303 jbd_lock_bh_state(bh_in);
304repeat:
305 if (jh_in->b_frozen_data) {
306 done_copy_out = 1;
307 new_page = virt_to_page(jh_in->b_frozen_data);
308 new_offset = offset_in_page(jh_in->b_frozen_data);
309 } else {
310 new_page = jh2bh(jh_in)->b_page;
311 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
312 }
313
314 mapped_data = kmap_atomic(new_page, KM_USER0);
315 /*
316 * Check for escaping
317 */
318 if (*((__be32 *)(mapped_data + new_offset)) ==
319 cpu_to_be32(JBD2_MAGIC_NUMBER)) {
320 need_copy_out = 1;
321 do_escape = 1;
322 }
323 kunmap_atomic(mapped_data, KM_USER0);
324
325 /*
326 * Do we need to do a data copy?
327 */
328 if (need_copy_out && !done_copy_out) {
329 char *tmp;
330
331 jbd_unlock_bh_state(bh_in);
332 tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
333 jbd_lock_bh_state(bh_in);
334 if (jh_in->b_frozen_data) {
335 jbd2_slab_free(tmp, bh_in->b_size);
336 goto repeat;
337 }
338
339 jh_in->b_frozen_data = tmp;
340 mapped_data = kmap_atomic(new_page, KM_USER0);
341 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
342 kunmap_atomic(mapped_data, KM_USER0);
343
344 new_page = virt_to_page(tmp);
345 new_offset = offset_in_page(tmp);
346 done_copy_out = 1;
347 }
348
349 /*
350 * Did we need to do an escaping? Now we've done all the
351 * copying, we can finally do so.
352 */
353 if (do_escape) {
354 mapped_data = kmap_atomic(new_page, KM_USER0);
355 *((unsigned int *)(mapped_data + new_offset)) = 0;
356 kunmap_atomic(mapped_data, KM_USER0);
357 }
358
359 /* keep subsequent assertions sane */
360 new_bh->b_state = 0;
361 init_buffer(new_bh, NULL, NULL);
362 atomic_set(&new_bh->b_count, 1);
363 jbd_unlock_bh_state(bh_in);
364
365 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
366
367 set_bh_page(new_bh, new_page, new_offset);
368 new_jh->b_transaction = NULL;
369 new_bh->b_size = jh2bh(jh_in)->b_size;
370 new_bh->b_bdev = transaction->t_journal->j_dev;
371 new_bh->b_blocknr = blocknr;
372 set_buffer_mapped(new_bh);
373 set_buffer_dirty(new_bh);
374
375 *jh_out = new_jh;
376
377 /*
378 * The to-be-written buffer needs to get moved to the io queue,
379 * and the original buffer whose contents we are shadowing or
380 * copying is moved to the transaction's shadow queue.
381 */
382 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
383 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
384 JBUFFER_TRACE(new_jh, "file as BJ_IO");
385 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
386
387 return do_escape | (done_copy_out << 1);
388}
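/*
 * A sketch of how a caller decodes the return value documented above; bit 0
 * reports an escape and bit 1 a copy-out. The commit path in this patch only
 * needs bit 0:
 *
 *	flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 *						   jh, &new_jh, blocknr);
 *	if (flags & 1)
 *		tag_flag |= JBD2_FLAG_ESCAPE;
 *
 * A set bit 1 would mean the data was copied into jh_in->b_frozen_data and
 * that copy, not the original buffer, is what gets written to the log.
 */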
389
390/*
391 * Allocation code for the journal file. Manage the space left in the
392 * journal, so that we can begin checkpointing when appropriate.
393 */
394
395/*
396 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
397 *
398 * Called with the journal already locked.
399 *
400 * Called under j_state_lock
401 */
402
403int __jbd2_log_space_left(journal_t *journal)
404{
405 int left = journal->j_free;
406
407 assert_spin_locked(&journal->j_state_lock);
408
409 /*
410 * Be pessimistic here about the number of those free blocks which
411 * might be required for log descriptor control blocks.
412 */
413
414#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
415
416 left -= MIN_LOG_RESERVED_BLOCKS;
417
418 if (left <= 0)
419 return 0;
420 left -= (left >> 3);
421 return left;
422}
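/*
 * A quick worked example with a hypothetical j_free of 1024 blocks: after
 * subtracting MIN_LOG_RESERVED_BLOCKS there are 992 left, and the final
 * "left -= left >> 3" removes another 124, so the function reports 868
 * blocks, roughly 85% of the raw free space, keeping headroom for
 * descriptor blocks.
 */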
423
424/*
425 * Called under j_state_lock. Returns true if a transaction was started.
426 */
427int __jbd2_log_start_commit(journal_t *journal, tid_t target)
428{
429 /*
430 * Are we already doing a recent enough commit?
431 */
432 if (!tid_geq(journal->j_commit_request, target)) {
433 /*
434 * We want a new commit: OK, mark the request and wake up the
435 * commit thread. We do _not_ do the commit ourselves.
436 */
437
438 journal->j_commit_request = target;
439 jbd_debug(1, "JBD: requesting commit %d/%d\n",
440 journal->j_commit_request,
441 journal->j_commit_sequence);
442 wake_up(&journal->j_wait_commit);
443 return 1;
444 }
445 return 0;
446}
447
448int jbd2_log_start_commit(journal_t *journal, tid_t tid)
449{
450 int ret;
451
452 spin_lock(&journal->j_state_lock);
453 ret = __jbd2_log_start_commit(journal, tid);
454 spin_unlock(&journal->j_state_lock);
455 return ret;
456}
457
458/*
459 * Force and wait upon a commit if the calling process is not within a
460 * transaction. This is used for forcing out undo-protected data which contains
461 * bitmaps, when the fs is running out of space.
462 *
463 * We can only force the running transaction if we don't have an active handle;
464 * otherwise, we will deadlock.
465 *
466 * Returns true if a transaction was started.
467 */
468int jbd2_journal_force_commit_nested(journal_t *journal)
469{
470 transaction_t *transaction = NULL;
471 tid_t tid;
472
473 spin_lock(&journal->j_state_lock);
474 if (journal->j_running_transaction && !current->journal_info) {
475 transaction = journal->j_running_transaction;
476 __jbd2_log_start_commit(journal, transaction->t_tid);
477 } else if (journal->j_committing_transaction)
478 transaction = journal->j_committing_transaction;
479
480 if (!transaction) {
481 spin_unlock(&journal->j_state_lock);
482 return 0; /* Nothing to retry */
483 }
484
485 tid = transaction->t_tid;
486 spin_unlock(&journal->j_state_lock);
487 jbd2_log_wait_commit(journal, tid);
488 return 1;
489}
490
491/*
492 * Start a commit of the current running transaction (if any). Returns true
493 * if a transaction was started, and fills its tid in at *ptid
494 */
495int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
496{
497 int ret = 0;
498
499 spin_lock(&journal->j_state_lock);
500 if (journal->j_running_transaction) {
501 tid_t tid = journal->j_running_transaction->t_tid;
502
503 ret = __jbd2_log_start_commit(journal, tid);
504 if (ret && ptid)
505 *ptid = tid;
506 } else if (journal->j_committing_transaction && ptid) {
507 /*
508 * If ext3_write_super() recently started a commit, then we
509 * have to wait for completion of that transaction
510 */
511 *ptid = journal->j_committing_transaction->t_tid;
512 ret = 1;
513 }
514 spin_unlock(&journal->j_state_lock);
515 return ret;
516}
517
518/*
519 * Wait for a specified commit to complete.
520 * The caller may not hold the journal lock.
521 */
522int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
523{
524 int err = 0;
525
526#ifdef CONFIG_JBD_DEBUG
527 spin_lock(&journal->j_state_lock);
528 if (!tid_geq(journal->j_commit_request, tid)) {
529 printk(KERN_EMERG
530 "%s: error: j_commit_request=%d, tid=%d\n",
531 __FUNCTION__, journal->j_commit_request, tid);
532 }
533 spin_unlock(&journal->j_state_lock);
534#endif
535 spin_lock(&journal->j_state_lock);
536 while (tid_gt(tid, journal->j_commit_sequence)) {
537 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
538 tid, journal->j_commit_sequence);
539 wake_up(&journal->j_wait_commit);
540 spin_unlock(&journal->j_state_lock);
541 wait_event(journal->j_wait_done_commit,
542 !tid_gt(tid, journal->j_commit_sequence));
543 spin_lock(&journal->j_state_lock);
544 }
545 spin_unlock(&journal->j_state_lock);
546
547 if (unlikely(is_journal_aborted(journal))) {
548 printk(KERN_EMERG "journal commit I/O error\n");
549 err = -EIO;
550 }
551 return err;
552}
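/*
 * Illustrative usage sketch: code that needs a given transaction on disk
 * requests the commit and then waits on it, for example
 *
 *	tid_t tid = transaction->t_tid;
 *	jbd2_log_start_commit(journal, tid);
 *	jbd2_log_wait_commit(journal, tid);
 *
 * which is essentially the pattern jbd2_journal_force_commit_nested()
 * follows above (it takes j_state_lock itself and uses the __ variant).
 */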
553
554/*
555 * Log buffer allocation routines:
556 */
557
558int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
559{
560 unsigned long blocknr;
561
562 spin_lock(&journal->j_state_lock);
563 J_ASSERT(journal->j_free > 1);
564
565 blocknr = journal->j_head;
566 journal->j_head++;
567 journal->j_free--;
568 if (journal->j_head == journal->j_last)
569 journal->j_head = journal->j_first;
570 spin_unlock(&journal->j_state_lock);
571 return jbd2_journal_bmap(journal, blocknr, retp);
572}
573
574/*
575 * Conversion of logical to physical block numbers for the journal
576 *
577 * On external journals the journal blocks are identity-mapped, so
578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready.
580 */
581int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long long *retp)
583{
584 int err = 0;
585 unsigned long long ret;
586
587 if (journal->j_inode) {
588 ret = bmap(journal->j_inode, blocknr);
589 if (ret)
590 *retp = ret;
591 else {
592 char b[BDEVNAME_SIZE];
593
594 printk(KERN_ALERT "%s: journal block not found "
595 "at offset %lu on %s\n",
596 __FUNCTION__,
597 blocknr,
598 bdevname(journal->j_dev, b));
599 err = -EIO;
600 __journal_abort_soft(journal, err);
601 }
602 } else {
603 *retp = blocknr; /* +journal->j_blk_offset */
604 }
605 return err;
606}
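/*
 * Illustrative note: for an inode-backed journal (j_inode set) the logical
 * journal block is translated via bmap(), so logical block 0 resolves to
 * wherever the journal inode's first data block lives on disk; a zero
 * result means the inode has a hole where a journal block should be, which
 * is treated as an I/O error and soft-aborts the journal.  For an external
 * journal device the mapping is the identity and *retp simply echoes
 * blocknr.
 */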
607
608/*
609 * We play buffer_head aliasing tricks to write data/metadata blocks to
610 * the journal without copying their contents, but for journal
611 * descriptor blocks we do need to generate bona fide buffers.
612 *
613 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
614 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
615 * But we don't bother doing that, so there will be coherency problems with
616 * mmaps of blockdevs which hold live JBD-controlled filesystems.
617 */
618struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
619{
620 struct buffer_head *bh;
621 unsigned long long blocknr;
622 int err;
623
624 err = jbd2_journal_next_log_block(journal, &blocknr);
625
626 if (err)
627 return NULL;
628
629 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
630 lock_buffer(bh);
631 memset(bh->b_data, 0, journal->j_blocksize);
632 set_buffer_uptodate(bh);
633 unlock_buffer(bh);
634 BUFFER_TRACE(bh, "return this buffer");
635 return jbd2_journal_add_journal_head(bh);
636}
637
638/*
639 * Management for journal control blocks: functions to create and
640 * destroy journal_t structures, and to initialise and read existing
641 * journal blocks from disk. */
642
643/* First: create and set up a journal_t object in memory. We initialise
644 * very few fields yet: that has to wait until we have created the
645 * journal structures from scratch, or loaded them from disk. */
646
647static journal_t * journal_init_common (void)
648{
649 journal_t *journal;
650 int err;
651
652 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
653 if (!journal)
654 goto fail;
655 memset(journal, 0, sizeof(*journal));
656
657 init_waitqueue_head(&journal->j_wait_transaction_locked);
658 init_waitqueue_head(&journal->j_wait_logspace);
659 init_waitqueue_head(&journal->j_wait_done_commit);
660 init_waitqueue_head(&journal->j_wait_checkpoint);
661 init_waitqueue_head(&journal->j_wait_commit);
662 init_waitqueue_head(&journal->j_wait_updates);
663 mutex_init(&journal->j_barrier);
664 mutex_init(&journal->j_checkpoint_mutex);
665 spin_lock_init(&journal->j_revoke_lock);
666 spin_lock_init(&journal->j_list_lock);
667 spin_lock_init(&journal->j_state_lock);
668
669 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
670
671 /* The journal is marked for error until we succeed with recovery! */
672 journal->j_flags = JBD2_ABORT;
673
674 /* Set up a default-sized revoke table for the new mount. */
675 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
676 if (err) {
677 kfree(journal);
678 goto fail;
679 }
680 return journal;
681fail:
682 return NULL;
683}
684
685/* jbd2_journal_init_dev and jbd2_journal_init_inode:
686 *
687 * Create a journal structure, assigning it some fixed set of disk blocks for
688 * the journal. We don't actually touch those disk blocks yet, but we
689 * need to set up all of the mapping information to tell the journaling
690 * system where the journal blocks are.
691 *
692 */
693
694/**
695 * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which holds the journalled filesystem for this journal.
698 * @start: Block nr at which the journal starts.
699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t *
702 *
703 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device.
705 *
706 */
707journal_t * jbd2_journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev,
709 unsigned long long start, int len, int blocksize)
710{
711 journal_t *journal = journal_init_common();
712 struct buffer_head *bh;
713 int n;
714
715 if (!journal)
716 return NULL;
717
718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
721 journal->j_wbufsize = n;
722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
723 if (!journal->j_wbuf) {
724 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
725 __FUNCTION__);
726 kfree(journal);
727 return NULL;
728 }
729 journal->j_dev = bdev;
730 journal->j_fs_dev = fs_dev;
731 journal->j_blk_offset = start;
732 journal->j_maxlen = len;
733
734 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
735 J_ASSERT(bh != NULL);
736 journal->j_sb_buffer = bh;
737 journal->j_superblock = (journal_superblock_t *)bh->b_data;
738
739 return journal;
740}
741
742/**
743 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
744 * @inode: An inode to create the journal in
745 *
746 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
747 * the journal. The inode must exist already, must support bmap() and
748 * must have all data blocks preallocated.
749 */
750journal_t * jbd2_journal_init_inode (struct inode *inode)
751{
752 struct buffer_head *bh;
753 journal_t *journal = journal_init_common();
754 int err;
755 int n;
756 unsigned long long blocknr;
757
758 if (!journal)
759 return NULL;
760
761 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
762 journal->j_inode = inode;
763 jbd_debug(1,
764 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
765 journal, inode->i_sb->s_id, inode->i_ino,
766 (long long) inode->i_size,
767 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
768
769 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
770 journal->j_blocksize = inode->i_sb->s_blocksize;
771
772 /* journal descriptor can store up to n blocks -bzzz */
773 n = journal->j_blocksize / sizeof(journal_block_tag_t);
774 journal->j_wbufsize = n;
775 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
776 if (!journal->j_wbuf) {
777 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
778 __FUNCTION__);
779 kfree(journal);
780 return NULL;
781 }
782
783 err = jbd2_journal_bmap(journal, 0, &blocknr);
784 /* If that failed, give up */
785 if (err) {
786 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
787 __FUNCTION__);
788 kfree(journal);
789 return NULL;
790 }
791
792 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
793 J_ASSERT(bh != NULL);
794 journal->j_sb_buffer = bh;
795 journal->j_superblock = (journal_superblock_t *)bh->b_data;
796
797 return journal;
798}
799
800/*
801 * If the journal init or create aborts, we need to mark the journal
802 * superblock as being NULL to prevent the journal destroy from writing
803 * back a bogus superblock.
804 */
805static void journal_fail_superblock (journal_t *journal)
806{
807 struct buffer_head *bh = journal->j_sb_buffer;
808 brelse(bh);
809 journal->j_sb_buffer = NULL;
810}
811
812/*
813 * Given a journal_t structure, initialise the various fields for
814 * startup of a new journaling session. We use this both when creating
815 * a journal, and after recovering an old journal to reset it for
816 * subsequent use.
817 */
818
819static int journal_reset(journal_t *journal)
820{
821 journal_superblock_t *sb = journal->j_superblock;
822 unsigned long long first, last;
823
824 first = be32_to_cpu(sb->s_first);
825 last = be32_to_cpu(sb->s_maxlen);
826
827 journal->j_first = first;
828 journal->j_last = last;
829
830 journal->j_head = first;
831 journal->j_tail = first;
832 journal->j_free = last - first;
833
834 journal->j_tail_sequence = journal->j_transaction_sequence;
835 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
836 journal->j_commit_request = journal->j_commit_sequence;
837
838 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
839
840 /* Add the dynamic fields and write it to disk. */
841 jbd2_journal_update_superblock(journal, 1);
842 jbd2_journal_start_thread(journal);
843 return 0;
844}
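/*
 * Worked example (assuming j_maxlen == 8192): with s_first == 1 and
 * s_maxlen == 8192 the reset leaves j_head == j_tail == 1 and
 * j_free == 8191, and caps a single transaction at 8192 / 4 == 2048
 * buffers before the dynamic fields are written back via
 * jbd2_journal_update_superblock().
 */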
845
846/**
847 * int jbd2_journal_create() - Initialise the new journal file
848 * @journal: Journal to create. This structure must have been initialised
849 *
850 * Given a journal_t structure which tells us which disk blocks we can
851 * use, create a new journal superblock and initialise all of the
852 * journal fields from scratch.
853 **/
854int jbd2_journal_create(journal_t *journal)
855{
856 unsigned long long blocknr;
857 struct buffer_head *bh;
858 journal_superblock_t *sb;
859 int i, err;
860
861 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
862 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
863 journal->j_maxlen);
864 journal_fail_superblock(journal);
865 return -EINVAL;
866 }
867
868 if (journal->j_inode == NULL) {
869 /*
870 * We don't know what block to start at!
871 */
872 printk(KERN_EMERG
873 "%s: creation of journal on external device!\n",
874 __FUNCTION__);
875 BUG();
876 }
877
878 /* Zero out the entire journal on disk. We cannot afford to
879 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
880 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
881 for (i = 0; i < journal->j_maxlen; i++) {
882 err = jbd2_journal_bmap(journal, i, &blocknr);
883 if (err)
884 return err;
885 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
886 lock_buffer(bh);
887 memset (bh->b_data, 0, journal->j_blocksize);
888 BUFFER_TRACE(bh, "marking dirty");
889 mark_buffer_dirty(bh);
890 BUFFER_TRACE(bh, "marking uptodate");
891 set_buffer_uptodate(bh);
892 unlock_buffer(bh);
893 __brelse(bh);
894 }
895
896 sync_blockdev(journal->j_dev);
897 jbd_debug(1, "JBD: journal cleared.\n");
898
899 /* OK, fill in the initial static fields in the new superblock */
900 sb = journal->j_superblock;
901
902 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
903 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
904
905 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
906 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
907 sb->s_first = cpu_to_be32(1);
908
909 journal->j_transaction_sequence = 1;
910
911 journal->j_flags &= ~JBD2_ABORT;
912 journal->j_format_version = 2;
913
914 return journal_reset(journal);
915}
916
917/**
918 * void jbd2_journal_update_superblock() - Update journal sb on disk.
919 * @journal: The journal to update.
920 * @wait: Set to '0' if you don't want to wait for IO completion.
921 *
922 * Update a journal's dynamic superblock fields and write it to disk,
923 * optionally waiting for the IO to complete.
924 */
925void jbd2_journal_update_superblock(journal_t *journal, int wait)
926{
927 journal_superblock_t *sb = journal->j_superblock;
928 struct buffer_head *bh = journal->j_sb_buffer;
929
930 /*
931 * As a special case, if the on-disk copy is already marked as needing
932 * no recovery (s_start == 0) and there are no outstanding transactions
933 * in the filesystem, then we can safely defer the superblock update
934 * until the next commit by setting JBD2_FLUSHED. This avoids
935 * attempting a write to a potentially read-only device.
936 */
937 if (sb->s_start == 0 && journal->j_tail_sequence ==
938 journal->j_transaction_sequence) {
939 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
940 "(start %ld, seq %d, errno %d)\n",
941 journal->j_tail, journal->j_tail_sequence,
942 journal->j_errno);
943 goto out;
944 }
945
946 spin_lock(&journal->j_state_lock);
947 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
948 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
949
950 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
951 sb->s_start = cpu_to_be32(journal->j_tail);
952 sb->s_errno = cpu_to_be32(journal->j_errno);
953 spin_unlock(&journal->j_state_lock);
954
955 BUFFER_TRACE(bh, "marking dirty");
956 mark_buffer_dirty(bh);
957 if (wait)
958 sync_dirty_buffer(bh);
959 else
960 ll_rw_block(SWRITE, 1, &bh);
961
962out:
963 /* If we have just flushed the log (by marking s_start==0), then
964 * any future commit will have to be careful to update the
965 * superblock again to re-record the true start of the log. */
966
967 spin_lock(&journal->j_state_lock);
968 if (sb->s_start)
969 journal->j_flags &= ~JBD2_FLUSHED;
970 else
971 journal->j_flags |= JBD2_FLUSHED;
972 spin_unlock(&journal->j_state_lock);
973}
974
975/*
976 * Read the superblock for a given journal, performing initial
977 * validation of the format.
978 */
979
980static int journal_get_superblock(journal_t *journal)
981{
982 struct buffer_head *bh;
983 journal_superblock_t *sb;
984 int err = -EIO;
985
986 bh = journal->j_sb_buffer;
987
988 J_ASSERT(bh != NULL);
989 if (!buffer_uptodate(bh)) {
990 ll_rw_block(READ, 1, &bh);
991 wait_on_buffer(bh);
992 if (!buffer_uptodate(bh)) {
993 printk (KERN_ERR
994 "JBD: IO error reading journal superblock\n");
995 goto out;
996 }
997 }
998
999 sb = journal->j_superblock;
1000
1001 err = -EINVAL;
1002
1003 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1004 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1005 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1006 goto out;
1007 }
1008
1009 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1010 case JBD2_SUPERBLOCK_V1:
1011 journal->j_format_version = 1;
1012 break;
1013 case JBD2_SUPERBLOCK_V2:
1014 journal->j_format_version = 2;
1015 break;
1016 default:
1017 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1018 goto out;
1019 }
1020
1021 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1022 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1023 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1024 printk (KERN_WARNING "JBD: journal file too short\n");
1025 goto out;
1026 }
1027
1028 return 0;
1029
1030out:
1031 journal_fail_superblock(journal);
1032 return err;
1033}
1034
1035/*
1036 * Load the on-disk journal superblock and read the key fields into the
1037 * journal_t.
1038 */
1039
1040static int load_superblock(journal_t *journal)
1041{
1042 int err;
1043 journal_superblock_t *sb;
1044
1045 err = journal_get_superblock(journal);
1046 if (err)
1047 return err;
1048
1049 sb = journal->j_superblock;
1050
1051 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1052 journal->j_tail = be32_to_cpu(sb->s_start);
1053 journal->j_first = be32_to_cpu(sb->s_first);
1054 journal->j_last = be32_to_cpu(sb->s_maxlen);
1055 journal->j_errno = be32_to_cpu(sb->s_errno);
1056
1057 return 0;
1058}
1059
1060
1061/**
1062 * int jbd2_journal_load() - Read journal from disk.
1063 * @journal: Journal to act on.
1064 *
1065 * Given a journal_t structure which tells us which disk blocks contain
1066 * a journal, read the journal from disk to initialise the in-memory
1067 * structures.
1068 */
1069int jbd2_journal_load(journal_t *journal)
1070{
1071 int err;
1072 journal_superblock_t *sb;
1073
1074 err = load_superblock(journal);
1075 if (err)
1076 return err;
1077
1078 sb = journal->j_superblock;
1079 /* If this is a V2 superblock, then we have to check the
1080 * feature flags on it. */
1081
1082 if (journal->j_format_version >= 2) {
1083 if ((sb->s_feature_ro_compat &
1084 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1085 (sb->s_feature_incompat &
1086 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1087 printk (KERN_WARNING
1088 "JBD: Unrecognised features on journal\n");
1089 return -EINVAL;
1090 }
1091 }
1092
1093 /*
1094 * Create a slab for this blocksize
1095 */
1096 err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
1097 if (err)
1098 return err;
1099
1100 /* Let the recovery code check whether it needs to recover any
1101 * data from the journal. */
1102 if (jbd2_journal_recover(journal))
1103 goto recovery_error;
1104
1105 /* OK, we've finished with the dynamic journal bits:
1106 * reinitialise the dynamic contents of the superblock in memory
1107 * and reset them on disk. */
1108 if (journal_reset(journal))
1109 goto recovery_error;
1110
1111 journal->j_flags &= ~JBD2_ABORT;
1112 journal->j_flags |= JBD2_LOADED;
1113 return 0;
1114
1115recovery_error:
1116 printk (KERN_WARNING "JBD: recovery failed\n");
1117 return -EIO;
1118}
1119
1120/**
1121 * void jbd2_journal_destroy() - Release a journal_t structure.
1122 * @journal: Journal to act on.
1123 *
1124 * Release a journal_t structure once it is no longer in use by the
1125 * journaled object.
1126 */
1127void jbd2_journal_destroy(journal_t *journal)
1128{
1129 /* Wait for the commit thread to wake up and die. */
1130 journal_kill_thread(journal);
1131
1132 /* Force a final log commit */
1133 if (journal->j_running_transaction)
1134 jbd2_journal_commit_transaction(journal);
1135
1136 /* Force any old transactions to disk */
1137
1138 /* Totally anal locking here... */
1139 spin_lock(&journal->j_list_lock);
1140 while (journal->j_checkpoint_transactions != NULL) {
1141 spin_unlock(&journal->j_list_lock);
1142 jbd2_log_do_checkpoint(journal);
1143 spin_lock(&journal->j_list_lock);
1144 }
1145
1146 J_ASSERT(journal->j_running_transaction == NULL);
1147 J_ASSERT(journal->j_committing_transaction == NULL);
1148 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1149 spin_unlock(&journal->j_list_lock);
1150
1151 /* We can now mark the journal as empty. */
1152 journal->j_tail = 0;
1153 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1154 if (journal->j_sb_buffer) {
1155 jbd2_journal_update_superblock(journal, 1);
1156 brelse(journal->j_sb_buffer);
1157 }
1158
1159 if (journal->j_inode)
1160 iput(journal->j_inode);
1161 if (journal->j_revoke)
1162 jbd2_journal_destroy_revoke(journal);
1163 kfree(journal->j_wbuf);
1164 kfree(journal);
1165}
1166
1167
1168/**
1169 * int jbd2_journal_check_used_features() - Check if the specified features are used.
1170 * @journal: Journal to check.
1171 * @compat: bitmask of compatible features
1172 * @ro: bitmask of features that force read-only mount
1173 * @incompat: bitmask of incompatible features
1174 *
1175 * Check whether the journal uses all of a given set of
1176 * features. Return true (non-zero) if it does.
1177 **/
1178
1179int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1180 unsigned long ro, unsigned long incompat)
1181{
1182 journal_superblock_t *sb;
1183
1184 if (!compat && !ro && !incompat)
1185 return 1;
1186 if (journal->j_format_version == 1)
1187 return 0;
1188
1189 sb = journal->j_superblock;
1190
1191 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1192 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1193 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1194 return 1;
1195
1196 return 0;
1197}
1198
1199/**
1200 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1201 * @journal: Journal to check.
1202 * @compat: bitmask of compatible features
1203 * @ro: bitmask of features that force read-only mount
1204 * @incompat: bitmask of incompatible features
1205 *
1206 * Check whether the journaling code supports the use of
1207 * all of a given set of features on this journal. Return true
1208 * (non-zero) if it can. */
1209
1210int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1211 unsigned long ro, unsigned long incompat)
1212{
1213 journal_superblock_t *sb;
1214
1215 if (!compat && !ro && !incompat)
1216 return 1;
1217
1218 sb = journal->j_superblock;
1219
1220 /* We can support any known requested features iff the
1221 * superblock is in version 2. Otherwise we fail to support any
1222 * extended sb features. */
1223
1224 if (journal->j_format_version != 2)
1225 return 0;
1226
1227 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1228 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1229 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1230 return 1;
1231
1232 return 0;
1233}
1234
1235/**
1236 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1237 * @journal: Journal to act on.
1238 * @compat: bitmask of compatible features
1239 * @ro: bitmask of features that force read-only mount
1240 * @incompat: bitmask of incompatible features
1241 *
1242 * Mark a given journal feature as present on the
1243 * superblock. Returns true if the requested features could be set.
1244 *
1245 */
1246
1247int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1248 unsigned long ro, unsigned long incompat)
1249{
1250 journal_superblock_t *sb;
1251
1252 if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1253 return 1;
1254
1255 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1256 return 0;
1257
1258 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1259 compat, ro, incompat);
1260
1261 sb = journal->j_superblock;
1262
1263 sb->s_feature_compat |= cpu_to_be32(compat);
1264 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1265 sb->s_feature_incompat |= cpu_to_be32(incompat);
1266
1267 return 1;
1268}
1269
1270
1271/**
1272 * int jbd2_journal_update_format () - Update on-disk journal structure.
1273 * @journal: Journal to act on.
1274 *
1275 * Given an initialised but unloaded journal struct, poke about in the
1276 * on-disk structure to update it to the most recent supported version.
1277 */
1278int jbd2_journal_update_format (journal_t *journal)
1279{
1280 journal_superblock_t *sb;
1281 int err;
1282
1283 err = journal_get_superblock(journal);
1284 if (err)
1285 return err;
1286
1287 sb = journal->j_superblock;
1288
1289 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1290 case JBD2_SUPERBLOCK_V2:
1291 return 0;
1292 case JBD2_SUPERBLOCK_V1:
1293 return journal_convert_superblock_v1(journal, sb);
1294 default:
1295 break;
1296 }
1297 return -EINVAL;
1298}
1299
1300static int journal_convert_superblock_v1(journal_t *journal,
1301 journal_superblock_t *sb)
1302{
1303 int offset, blocksize;
1304 struct buffer_head *bh;
1305
1306 printk(KERN_WARNING
1307 "JBD: Converting superblock from version 1 to 2.\n");
1308
1309 /* Pre-initialise new fields to zero */
1310 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1311 blocksize = be32_to_cpu(sb->s_blocksize);
1312 memset(&sb->s_feature_compat, 0, blocksize-offset);
1313
1314 sb->s_nr_users = cpu_to_be32(1);
1315 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1316 journal->j_format_version = 2;
1317
1318 bh = journal->j_sb_buffer;
1319 BUFFER_TRACE(bh, "marking dirty");
1320 mark_buffer_dirty(bh);
1321 sync_dirty_buffer(bh);
1322 return 0;
1323}
1324
1325
1326/**
1327 * int jbd2_journal_flush () - Flush journal
1328 * @journal: Journal to act on.
1329 *
1330 * Flush all data for a given journal to disk and empty the journal.
1331 * Filesystems can use this when remounting readonly to ensure that
1332 * recovery does not need to happen on remount.
1333 */
1334
1335int jbd2_journal_flush(journal_t *journal)
1336{
1337 int err = 0;
1338 transaction_t *transaction = NULL;
1339 unsigned long old_tail;
1340
1341 spin_lock(&journal->j_state_lock);
1342
1343 /* Force everything buffered to the log... */
1344 if (journal->j_running_transaction) {
1345 transaction = journal->j_running_transaction;
1346 __jbd2_log_start_commit(journal, transaction->t_tid);
1347 } else if (journal->j_committing_transaction)
1348 transaction = journal->j_committing_transaction;
1349
1350 /* Wait for the log commit to complete... */
1351 if (transaction) {
1352 tid_t tid = transaction->t_tid;
1353
1354 spin_unlock(&journal->j_state_lock);
1355 jbd2_log_wait_commit(journal, tid);
1356 } else {
1357 spin_unlock(&journal->j_state_lock);
1358 }
1359
1360 /* ...and flush everything in the log out to disk. */
1361 spin_lock(&journal->j_list_lock);
1362 while (!err && journal->j_checkpoint_transactions != NULL) {
1363 spin_unlock(&journal->j_list_lock);
1364 err = jbd2_log_do_checkpoint(journal);
1365 spin_lock(&journal->j_list_lock);
1366 }
1367 spin_unlock(&journal->j_list_lock);
1368 jbd2_cleanup_journal_tail(journal);
1369
1370 /* Finally, mark the journal as really needing no recovery.
1371 * This sets s_start==0 in the underlying superblock, which is
1372 * the magic code for a fully-recovered superblock. Any future
1373 * commits of data to the journal will restore the current
1374 * s_start value. */
1375 spin_lock(&journal->j_state_lock);
1376 old_tail = journal->j_tail;
1377 journal->j_tail = 0;
1378 spin_unlock(&journal->j_state_lock);
1379 jbd2_journal_update_superblock(journal, 1);
1380 spin_lock(&journal->j_state_lock);
1381 journal->j_tail = old_tail;
1382
1383 J_ASSERT(!journal->j_running_transaction);
1384 J_ASSERT(!journal->j_committing_transaction);
1385 J_ASSERT(!journal->j_checkpoint_transactions);
1386 J_ASSERT(journal->j_head == journal->j_tail);
1387 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1388 spin_unlock(&journal->j_state_lock);
1389 return err;
1390}
1391
1392/**
1393 * int jbd2_journal_wipe() - Wipe journal contents
1394 * @journal: Journal to act on.
1395 * @write: flag (see below)
1396 *
1397 * Wipe out all of the contents of a journal, safely. This will produce
1398 * a warning if the journal contains any valid recovery information.
1399 * Must be called between journal_init_*() and jbd2_journal_load().
1400 *
1401 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1402 * we merely suppress recovery.
1403 */
1404
1405int jbd2_journal_wipe(journal_t *journal, int write)
1406{
1407 journal_superblock_t *sb;
1408 int err = 0;
1409
1410 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
1411
1412 err = load_superblock(journal);
1413 if (err)
1414 return err;
1415
1416 sb = journal->j_superblock;
1417
1418 if (!journal->j_tail)
1419 goto no_recovery;
1420
1421 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1422 write ? "Clearing" : "Ignoring");
1423
1424 err = jbd2_journal_skip_recovery(journal);
1425 if (write)
1426 jbd2_journal_update_superblock(journal, 1);
1427
1428 no_recovery:
1429 return err;
1430}
1431
1432/*
1433 * journal_dev_name: format a character string to describe on what
1434 * device this journal is present.
1435 */
1436
1437static const char *journal_dev_name(journal_t *journal, char *buffer)
1438{
1439 struct block_device *bdev;
1440
1441 if (journal->j_inode)
1442 bdev = journal->j_inode->i_sb->s_bdev;
1443 else
1444 bdev = journal->j_dev;
1445
1446 return bdevname(bdev, buffer);
1447}
1448
1449/*
1450 * Journal abort has very specific semantics, which we describe
1451 * below for jbd2_journal_abort().
1452 *
1453 * Two internal functions, which provide abort to the jbd layer
1454 * itself, are here.
1455 */
1456
1457/*
1458 * Quick version for internal journal use (doesn't lock the journal).
1459 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1460 * and don't attempt to make any other journal updates.
1461 */
1462void __jbd2_journal_abort_hard(journal_t *journal)
1463{
1464 transaction_t *transaction;
1465 char b[BDEVNAME_SIZE];
1466
1467 if (journal->j_flags & JBD2_ABORT)
1468 return;
1469
1470 printk(KERN_ERR "Aborting journal on device %s.\n",
1471 journal_dev_name(journal, b));
1472
1473 spin_lock(&journal->j_state_lock);
1474 journal->j_flags |= JBD2_ABORT;
1475 transaction = journal->j_running_transaction;
1476 if (transaction)
1477 __jbd2_log_start_commit(journal, transaction->t_tid);
1478 spin_unlock(&journal->j_state_lock);
1479}
1480
1481/* Soft abort: record the abort error status in the journal superblock,
1482 * but don't do any other IO. */
1483static void __journal_abort_soft (journal_t *journal, int errno)
1484{
1485 if (journal->j_flags & JBD2_ABORT)
1486 return;
1487
1488 if (!journal->j_errno)
1489 journal->j_errno = errno;
1490
1491 __jbd2_journal_abort_hard(journal);
1492
1493 if (errno)
1494 jbd2_journal_update_superblock(journal, 1);
1495}
1496
1497/**
1498 * void jbd2_journal_abort () - Shutdown the journal immediately.
1499 * @journal: the journal to shutdown.
1500 * @errno: an error number to record in the journal indicating
1501 * the reason for the shutdown.
1502 *
1503 * Perform a complete, immediate shutdown of the ENTIRE
1504 * journal (not of a single transaction). This operation cannot be
1505 * undone without closing and reopening the journal.
1506 *
1507 * The jbd2_journal_abort function is intended to support higher level error
1508 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1509 * mode.
1510 *
1511 * Journal abort has very specific semantics. Any existing dirty,
1512 * unjournaled buffers in the main filesystem will still be written to
1513 * disk by bdflush, but the journaling mechanism will be suspended
1514 * immediately and no further transaction commits will be honoured.
1515 *
1516 * Any dirty, journaled buffers will be written back to disk without
1517 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1518 * filesystem, but we _do_ attempt to leave as much data as possible
1519 * behind for fsck to use for cleanup.
1520 *
1521 * Any attempt to get a new transaction handle on a journal which is in
1522 * ABORT state will just result in an -EROFS error return. A
1523 * jbd2_journal_stop on an existing handle will return -EIO if we have
1524 * entered abort state during the update.
1525 *
1526 * Recursive transactions are not disturbed by journal abort until the
1527 * final jbd2_journal_stop, which will receive the -EIO error.
1528 *
1529 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
1530 * which will be recorded (if possible) in the journal superblock. This
1531 * allows a client to record failure conditions in the middle of a
1532 * transaction without having to complete the transaction to record the
1533 * failure to disk. ext3_error, for example, now uses this
1534 * functionality.
1535 *
1536 * Errors which originate from within the journaling layer will NOT
1537 * supply an errno; a null errno implies that absolutely no further
1538 * writes are done to the journal (unless there are any already in
1539 * progress).
1540 *
1541 */
1542
1543void jbd2_journal_abort(journal_t *journal, int errno)
1544{
1545 __journal_abort_soft(journal, errno);
1546}
1547
1548/**
1549 * int jbd2_journal_errno () - returns the journal's error state.
1550 * @journal: journal to examine.
1551 *
1552 * This is the errno number set with jbd2_journal_abort(), the last
1553 * time the journal was mounted - if the journal was stopped
1554 * without calling abort this will be 0.
1555 *
1556 * If the journal has been aborted during this mount, -EROFS will
1557 * be returned.
1558 */
1559int jbd2_journal_errno(journal_t *journal)
1560{
1561 int err;
1562
1563 spin_lock(&journal->j_state_lock);
1564 if (journal->j_flags & JBD2_ABORT)
1565 err = -EROFS;
1566 else
1567 err = journal->j_errno;
1568 spin_unlock(&journal->j_state_lock);
1569 return err;
1570}
1571
1572/**
1573 * int jbd2_journal_clear_err () - clears the journal's error state
1574 * @journal: journal to act on.
1575 *
1576 * An error must be cleared or Acked to take a FS out of readonly
1577 * mode.
1578 */
1579int jbd2_journal_clear_err(journal_t *journal)
1580{
1581 int err = 0;
1582
1583 spin_lock(&journal->j_state_lock);
1584 if (journal->j_flags & JBD2_ABORT)
1585 err = -EROFS;
1586 else
1587 journal->j_errno = 0;
1588 spin_unlock(&journal->j_state_lock);
1589 return err;
1590}
1591
1592/**
1593 * void jbd2_journal_ack_err() - Ack journal err.
1594 * @journal: journal to act on.
1595 *
1596 * An error must be cleared or Acked to take a FS out of readonly
1597 * mode.
1598 */
1599void jbd2_journal_ack_err(journal_t *journal)
1600{
1601 spin_lock(&journal->j_state_lock);
1602 if (journal->j_errno)
1603 journal->j_flags |= JBD2_ACK_ERR;
1604 spin_unlock(&journal->j_state_lock);
1605}
1606
1607int jbd2_journal_blocks_per_page(struct inode *inode)
1608{
1609 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1610}
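/*
 * Illustrative example: with 4K pages (PAGE_CACHE_SHIFT == 12) a
 * filesystem using 1K blocks (s_blocksize_bits == 10) gets 1 << 2 == 4
 * journal blocks per page, while a 4K-block filesystem gets exactly one.
 */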
1611
1612/*
1613 * helper functions to deal with 32 or 64bit block numbers.
1614 */
1615size_t journal_tag_bytes(journal_t *journal)
1616{
1617 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
1618 return JBD_TAG_SIZE64;
1619 else
1620 return JBD_TAG_SIZE32;
1621}
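/*
 * Illustrative note: the 64-bit tag format adds t_blocknr_high to each
 * tag, so enabling JBD2_FEATURE_INCOMPAT_64BIT makes tags larger
 * (nominally JBD_TAG_SIZE64 == 12 bytes versus JBD_TAG_SIZE32 == 8) and
 * therefore fewer of them fit in one descriptor block, roughly
 * (j_blocksize - sizeof(journal_header_t)) / tag size.
 */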
1622
1623/*
1624 * Simple support for retrying memory allocations. Introduced to help to
1625 * debug different VM deadlock avoidance strategies.
1626 */
1627void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1628{
1629 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1630}
1631
1632/*
1633 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1634 * and allocate frozen and commit buffers from these slabs.
1635 *
1636 * The reason for doing this is to avoid SLAB_DEBUG, since it could
1637 * cause a bh to cross a page boundary.
1638 */
1639
1640#define JBD_MAX_SLABS 5
1641#define JBD_SLAB_INDEX(size) (size >> 11)
1642
1643static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
1644static const char *jbd_slab_names[JBD_MAX_SLABS] = {
1645 "jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
1646};
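/*
 * Illustrative mapping: JBD_SLAB_INDEX() shifts the slab size right by 11,
 * so 1k -> 0, 2k -> 1, 4k -> 2 and 8k -> 4.  Index 3 is never generated
 * for power-of-two block sizes, which is why the name table carries a NULL
 * in that slot.
 */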
1647
1648static void jbd2_journal_destroy_jbd_slabs(void)
1649{
1650 int i;
1651
1652 for (i = 0; i < JBD_MAX_SLABS; i++) {
1653 if (jbd_slab[i])
1654 kmem_cache_destroy(jbd_slab[i]);
1655 jbd_slab[i] = NULL;
1656 }
1657}
1658
1659static int jbd2_journal_create_jbd_slab(size_t slab_size)
1660{
1661 int i = JBD_SLAB_INDEX(slab_size);
1662
1663 BUG_ON(i >= JBD_MAX_SLABS);
1664
1665 /*
1666 * Check if we already have a slab created for this size
1667 */
1668 if (jbd_slab[i])
1669 return 0;
1670
1671 /*
1672 * Create a slab and force alignment to be same as slabsize -
1673 * this will make sure that allocations won't cross the page
1674 * boundary.
1675 */
1676 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1677 slab_size, slab_size, 0, NULL, NULL);
1678 if (!jbd_slab[i]) {
1679 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1680 return -ENOMEM;
1681 }
1682 return 0;
1683}
1684
1685void * jbd2_slab_alloc(size_t size, gfp_t flags)
1686{
1687 int idx;
1688
1689 idx = JBD_SLAB_INDEX(size);
1690 BUG_ON(jbd_slab[idx] == NULL);
1691 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1692}
1693
1694void jbd2_slab_free(void *ptr, size_t size)
1695{
1696 int idx;
1697
1698 idx = JBD_SLAB_INDEX(size);
1699 BUG_ON(jbd_slab[idx] == NULL);
1700 kmem_cache_free(jbd_slab[idx], ptr);
1701}
1702
1703/*
1704 * Journal_head storage management
1705 */
1706static kmem_cache_t *jbd2_journal_head_cache;
1707#ifdef CONFIG_JBD_DEBUG
1708static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1709#endif
1710
1711static int journal_init_jbd2_journal_head_cache(void)
1712{
1713 int retval;
1714
1715 J_ASSERT(jbd2_journal_head_cache == 0);
1716 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1717 sizeof(struct journal_head),
1718 0, /* offset */
1719 0, /* flags */
1720 NULL, /* ctor */
1721 NULL); /* dtor */
1722 retval = 0;
1723 if (jbd2_journal_head_cache == 0) {
1724 retval = -ENOMEM;
1725 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1726 }
1727 return retval;
1728}
1729
1730static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1731{
1732 J_ASSERT(jbd2_journal_head_cache != NULL);
1733 kmem_cache_destroy(jbd2_journal_head_cache);
1734 jbd2_journal_head_cache = NULL;
1735}
1736
1737/*
1738 * journal_head splicing and dicing
1739 */
1740static struct journal_head *journal_alloc_journal_head(void)
1741{
1742 struct journal_head *ret;
1743 static unsigned long last_warning;
1744
1745#ifdef CONFIG_JBD_DEBUG
1746 atomic_inc(&nr_journal_heads);
1747#endif
1748 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1749 if (ret == 0) {
1750 jbd_debug(1, "out of memory for journal_head\n");
1751 if (time_after(jiffies, last_warning + 5*HZ)) {
1752 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1753 __FUNCTION__);
1754 last_warning = jiffies;
1755 }
1756 while (ret == 0) {
1757 yield();
1758 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1759 }
1760 }
1761 return ret;
1762}
1763
1764static void journal_free_journal_head(struct journal_head *jh)
1765{
1766#ifdef CONFIG_JBD_DEBUG
1767 atomic_dec(&nr_journal_heads);
1768 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1769#endif
1770 kmem_cache_free(jbd2_journal_head_cache, jh);
1771}
1772
1773/*
1774 * A journal_head is attached to a buffer_head whenever JBD has an
1775 * interest in the buffer.
1776 *
1777 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1778 * is set. This bit is tested in core kernel code where we need to take
1779 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1780 * there.
1781 *
1782 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1783 *
1784 * When a buffer has its BH_JBD bit set it is immune from being released by
1785 * core kernel code, mainly via ->b_count.
1786 *
1787 * A journal_head may be detached from its buffer_head when the journal_head's
1788 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1789 * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
1790 * journal_head can be dropped if needed.
1791 *
1792 * Various places in the kernel want to attach a journal_head to a buffer_head
1793 * _before_ attaching the journal_head to a transaction. To protect the
1794 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
1795 * journal_head's b_jcount refcount by one. The caller must call
1796 * jbd2_journal_put_journal_head() to undo this.
1797 *
1798 * So the typical usage would be:
1799 *
1800 * (Attach a journal_head if needed. Increments b_jcount)
1801 * struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1802 * ...
1803 * jh->b_transaction = xxx;
1804 * jbd2_journal_put_journal_head(jh);
1805 *
1806 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1807 * because it has a non-zero b_transaction.
1808 */
1809
1810/*
1811 * Give a buffer_head a journal_head.
1812 *
1813 * Doesn't need the journal lock.
1814 * May sleep.
1815 */
1816struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
1817{
1818 struct journal_head *jh;
1819 struct journal_head *new_jh = NULL;
1820
1821repeat:
1822 if (!buffer_jbd(bh)) {
1823 new_jh = journal_alloc_journal_head();
1824 memset(new_jh, 0, sizeof(*new_jh));
1825 }
1826
1827 jbd_lock_bh_journal_head(bh);
1828 if (buffer_jbd(bh)) {
1829 jh = bh2jh(bh);
1830 } else {
1831 J_ASSERT_BH(bh,
1832 (atomic_read(&bh->b_count) > 0) ||
1833 (bh->b_page && bh->b_page->mapping));
1834
1835 if (!new_jh) {
1836 jbd_unlock_bh_journal_head(bh);
1837 goto repeat;
1838 }
1839
1840 jh = new_jh;
1841 new_jh = NULL; /* We consumed it */
1842 set_buffer_jbd(bh);
1843 bh->b_private = jh;
1844 jh->b_bh = bh;
1845 get_bh(bh);
1846 BUFFER_TRACE(bh, "added journal_head");
1847 }
1848 jh->b_jcount++;
1849 jbd_unlock_bh_journal_head(bh);
1850 if (new_jh)
1851 journal_free_journal_head(new_jh);
1852 return bh->b_private;
1853}
1854
1855/*
1856 * Grab a ref against this buffer_head's journal_head. If it ended up not
1857 * having a journal_head, return NULL
1858 */
1859struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
1860{
1861 struct journal_head *jh = NULL;
1862
1863 jbd_lock_bh_journal_head(bh);
1864 if (buffer_jbd(bh)) {
1865 jh = bh2jh(bh);
1866 jh->b_jcount++;
1867 }
1868 jbd_unlock_bh_journal_head(bh);
1869 return jh;
1870}
1871
1872static void __journal_remove_journal_head(struct buffer_head *bh)
1873{
1874 struct journal_head *jh = bh2jh(bh);
1875
1876 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1877
1878 get_bh(bh);
1879 if (jh->b_jcount == 0) {
1880 if (jh->b_transaction == NULL &&
1881 jh->b_next_transaction == NULL &&
1882 jh->b_cp_transaction == NULL) {
1883 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1884 J_ASSERT_BH(bh, buffer_jbd(bh));
1885 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1886 BUFFER_TRACE(bh, "remove journal_head");
1887 if (jh->b_frozen_data) {
1888 printk(KERN_WARNING "%s: freeing "
1889 "b_frozen_data\n",
1890 __FUNCTION__);
1891 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
1892 }
1893 if (jh->b_committed_data) {
1894 printk(KERN_WARNING "%s: freeing "
1895 "b_committed_data\n",
1896 __FUNCTION__);
1897 jbd2_slab_free(jh->b_committed_data, bh->b_size);
1898 }
1899 bh->b_private = NULL;
1900 jh->b_bh = NULL; /* debug, really */
1901 clear_buffer_jbd(bh);
1902 __brelse(bh);
1903 journal_free_journal_head(jh);
1904 } else {
1905 BUFFER_TRACE(bh, "journal_head was locked");
1906 }
1907 }
1908}
1909
1910/*
1911 * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
1912 * and has a zero b_jcount then remove and release its journal_head. If we did
1913 * see that the buffer is not used by any transaction we also "logically"
1914 * decrement ->b_count.
1915 *
1916 * We in fact take an additional increment on ->b_count as a convenience,
1917 * because the caller usually wants to do additional things with the bh
1918 * after calling here.
1919 * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
1920 * time. Once the caller has run __brelse(), the buffer is eligible for
1921 * reaping by try_to_free_buffers().
1922 */
1923void jbd2_journal_remove_journal_head(struct buffer_head *bh)
1924{
1925 jbd_lock_bh_journal_head(bh);
1926 __journal_remove_journal_head(bh);
1927 jbd_unlock_bh_journal_head(bh);
1928}
1929
1930/*
1931 * Drop a reference on the passed journal_head. If it fell to zero then try to
1932 * release the journal_head from the buffer_head.
1933 */
1934void jbd2_journal_put_journal_head(struct journal_head *jh)
1935{
1936 struct buffer_head *bh = jh2bh(jh);
1937
1938 jbd_lock_bh_journal_head(bh);
1939 J_ASSERT_JH(jh, jh->b_jcount > 0);
1940 --jh->b_jcount;
1941 if (!jh->b_jcount && !jh->b_transaction) {
1942 __journal_remove_journal_head(bh);
1943 __brelse(bh);
1944 }
1945 jbd_unlock_bh_journal_head(bh);
1946}
1947
1948/*
1949 * /proc tunables
1950 */
1951#if defined(CONFIG_JBD_DEBUG)
1952int jbd2_journal_enable_debug;
1953EXPORT_SYMBOL(jbd2_journal_enable_debug);
1954#endif
1955
1956#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1957
1958static struct proc_dir_entry *proc_jbd_debug;
1959
1960static int read_jbd_debug(char *page, char **start, off_t off,
1961 int count, int *eof, void *data)
1962{
1963 int ret;
1964
1965 ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
1966 *eof = 1;
1967 return ret;
1968}
1969
1970static int write_jbd_debug(struct file *file, const char __user *buffer,
1971 unsigned long count, void *data)
1972{
1973 char buf[32];
1974
1975 if (count > ARRAY_SIZE(buf) - 1)
1976 count = ARRAY_SIZE(buf) - 1;
1977 if (copy_from_user(buf, buffer, count))
1978 return -EFAULT;
1979 buf[ARRAY_SIZE(buf) - 1] = '\0';
1980 jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
1981 return count;
1982}
1983
1984#define JBD_PROC_NAME "sys/fs/jbd2-debug"
1985
1986static void __init create_jbd_proc_entry(void)
1987{
1988 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
1989 if (proc_jbd_debug) {
1990 /* Why is this so hard? */
1991 proc_jbd_debug->read_proc = read_jbd_debug;
1992 proc_jbd_debug->write_proc = write_jbd_debug;
1993 }
1994}
1995
1996static void __exit jbd2_remove_jbd_proc_entry(void)
1997{
1998 if (proc_jbd_debug)
1999 remove_proc_entry(JBD_PROC_NAME, NULL);
2000}
2001
2002#else
2003
2004#define create_jbd_proc_entry() do {} while (0)
2005#define jbd2_remove_jbd_proc_entry() do {} while (0)
2006
2007#endif
2008
2009kmem_cache_t *jbd2_handle_cache;
2010
2011static int __init journal_init_handle_cache(void)
2012{
2013 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
2014 sizeof(handle_t),
2015 0, /* offset */
2016 0, /* flags */
2017 NULL, /* ctor */
2018 NULL); /* dtor */
2019 if (jbd2_handle_cache == NULL) {
2020 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2021 return -ENOMEM;
2022 }
2023 return 0;
2024}
2025
2026static void jbd2_journal_destroy_handle_cache(void)
2027{
2028 if (jbd2_handle_cache)
2029 kmem_cache_destroy(jbd2_handle_cache);
2030}
2031
2032/*
2033 * Module startup and shutdown
2034 */
2035
2036static int __init journal_init_caches(void)
2037{
2038 int ret;
2039
2040 ret = jbd2_journal_init_revoke_caches();
2041 if (ret == 0)
2042 ret = journal_init_jbd2_journal_head_cache();
2043 if (ret == 0)
2044 ret = journal_init_handle_cache();
2045 return ret;
2046}
2047
2048static void jbd2_journal_destroy_caches(void)
2049{
2050 jbd2_journal_destroy_revoke_caches();
2051 jbd2_journal_destroy_jbd2_journal_head_cache();
2052 jbd2_journal_destroy_handle_cache();
2053 jbd2_journal_destroy_jbd_slabs();
2054}
2055
2056static int __init journal_init(void)
2057{
2058 int ret;
2059
2060 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2061
2062 ret = journal_init_caches();
2063 if (ret != 0)
2064 jbd2_journal_destroy_caches();
2065 create_jbd_proc_entry();
2066 return ret;
2067}
2068
2069static void __exit journal_exit(void)
2070{
2071#ifdef CONFIG_JBD_DEBUG
2072 int n = atomic_read(&nr_journal_heads);
2073 if (n)
2074 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2075#endif
2076 jbd2_remove_jbd_proc_entry();
2077 jbd2_journal_destroy_caches();
2078}
2079
2080MODULE_LICENSE("GPL");
2081module_init(journal_init);
2082module_exit(journal_exit);
2083
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
1/*
2 * linux/fs/jbd2/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned long long blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = jbd2_journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned long long blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(journal_t *journal, struct buffer_head *bh)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0, size = journal->j_blocksize;
186 int tag_bytes = journal_tag_bytes(journal);
187
188 tagp = &bh->b_data[sizeof(journal_header_t)];
189
190 while ((tagp - bh->b_data + tag_bytes) <= size) {
191 tag = (journal_block_tag_t *) tagp;
192
193 nr++;
194 tagp += tag_bytes;
195 if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
196 tagp += 16;
197
198 if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
199 break;
200 }
201
202 return nr;
203}
204
205
206/* Make sure we wrap around the log correctly! */
207#define wrap(journal, var) \
208do { \
209 if (var >= (journal)->j_last) \
210 var -= ((journal)->j_last - (journal)->j_first); \
211} while (0)
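/*
 * Illustrative example: with j_first == 1 and j_last == 8193, a log block
 * cursor that reaches 8193 is wrapped back to 1 (8193 - (8193 - 1)), so a
 * traversal can keep incrementing its cursor and simply invoke wrap()
 * after each step.
 */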
212
213/**
214 * jbd2_journal_recover - recovers an on-disk journal
215 * @journal: the journal to recover
216 *
217 * The primary function for recovering the log contents when mounting a
218 * journaled device.
219 *
220 * Recovery is done in three passes. In the first pass, we look for the
221 * end of the log. In the second, we assemble the list of revoke
222 * blocks. In the third and final pass, we replay any un-revoked blocks
223 * in the log.
224 */
225int jbd2_journal_recover(journal_t *journal)
226{
227 int err;
228 journal_superblock_t * sb;
229
230 struct recovery_info info;
231
232 memset(&info, 0, sizeof(info));
233 sb = journal->j_superblock;
234
235 /*
236 * The journal superblock's s_start field (the current log head)
237 * is always zero if, and only if, the journal was cleanly
238 * unmounted.
239 */
240
241 if (!sb->s_start) {
242 jbd_debug(1, "No recovery required, last transaction %d\n",
243 be32_to_cpu(sb->s_sequence));
244 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
245 return 0;
246 }
247
248 err = do_one_pass(journal, &info, PASS_SCAN);
249 if (!err)
250 err = do_one_pass(journal, &info, PASS_REVOKE);
251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY);
253
254 jbd_debug(0, "JBD: recovery, exit status %d, "
255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259
260 /* Restart the log at the next transaction ID, thus invalidating
261 * any existing commit records in the log. */
262 journal->j_transaction_sequence = ++info.end_transaction;
263
264 jbd2_journal_clear_revoke(journal);
265 sync_blockdev(journal->j_fs_dev);
266 return err;
267}
268
269/**
270 * jbd2_journal_skip_recovery - Start journal and wipe existing records
271 * @journal: journal to startup
272 *
273 * Locate any valid recovery information from the journal and set up the
274 * journal structures in memory to ignore it (presumably because the
275 * caller has evidence that it is out of date).
276 * This function doesn't appear to be exported.
277 *
278 * We perform one pass over the journal to allow us to tell the user how
279 * much recovery information is being erased, and to let us initialise
280 * the journal transaction sequence numbers to the next unused ID.
281 */
282int jbd2_journal_skip_recovery(journal_t *journal)
283{
284 int err;
285 journal_superblock_t * sb;
286
287 struct recovery_info info;
288
289 memset (&info, 0, sizeof(info));
290 sb = journal->j_superblock;
291
292 err = do_one_pass(journal, &info, PASS_SCAN);
293
294 if (err) {
295 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296 ++journal->j_transaction_sequence;
297 } else {
298#ifdef CONFIG_JBD2_DEBUG
299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300#endif
301 jbd_debug(0,
302 "JBD: ignoring %d transaction%s from the journal.\n",
303 dropped, (dropped == 1) ? "" : "s");
304 journal->j_transaction_sequence = ++info.end_transaction;
305 }
306
307 journal->j_tail = 0;
308 return err;
309}
310
311static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
312{
313 unsigned long long block = be32_to_cpu(tag->t_blocknr);
314 if (tag_bytes > JBD_TAG_SIZE32)
315 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
316 return block;
317}
318
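A quick sanity check of the split-and-recombine arithmetic, with a plain uint64_t standing in for the big-endian on-disk t_blocknr/t_blocknr_high fields:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
	uint64_t blocknr = 0x123456789AULL;        /* > 32 bits, needs the 64-bit feature */
	uint32_t lo = (uint32_t)blocknr;           /* models t_blocknr       */
	uint32_t hi = (uint32_t)(blocknr >> 32);   /* models t_blocknr_high  */

	/* read_tag_block() only folds in the high half when tag_bytes > JBD_TAG_SIZE32 */
	uint64_t again = (uint64_t)lo | ((uint64_t)hi << 32);

	assert(again == blocknr);
	printf("0x%llx\n", (unsigned long long)again);
	return 0;
}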
319static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass)
321{
322 unsigned int first_commit_ID, next_commit_ID;
323 unsigned long next_log_block;
324 int err, success = 0;
325 journal_superblock_t * sb;
326 journal_header_t * tmp;
327 struct buffer_head * bh;
328 unsigned int sequence;
329 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal);
331
332 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC;
334 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
335 / tag_bytes);
336
337 /*
338 * First thing is to establish what we expect to find in the log
339 * (in terms of transaction IDs), and where (in terms of log
340 * block offsets): query the superblock.
341 */
342
343 sb = journal->j_superblock;
344 next_commit_ID = be32_to_cpu(sb->s_sequence);
345 next_log_block = be32_to_cpu(sb->s_start);
346
347 first_commit_ID = next_commit_ID;
348 if (pass == PASS_SCAN)
349 info->start_transaction = first_commit_ID;
350
351 jbd_debug(1, "Starting recovery pass %d\n", pass);
352
353 /*
354 * Now we walk through the log, transaction by transaction,
355 * making sure that each transaction has a commit block in the
356 * expected place. Each complete transaction gets replayed back
357 * into the main filesystem.
358 */
359
360 while (1) {
361 int flags;
362 char * tagp;
363 journal_block_tag_t * tag;
364 struct buffer_head * obh;
365 struct buffer_head * nbh;
366
367 cond_resched(); /* We're under lock_kernel() */
368
369 /* If we already know where to stop the log traversal,
370 * check right now that we haven't gone past the end of
371 * the log. */
372
373 if (pass != PASS_SCAN)
374 if (tid_geq(next_commit_ID, info->end_transaction))
375 break;
376
377 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
378 next_commit_ID, next_log_block, journal->j_last);
379
380 /* Skip over each chunk of the transaction looking for
381 * either the next descriptor block or the final commit
382 * record. */
383
384 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
385 err = jread(&bh, journal, next_log_block);
386 if (err)
387 goto failed;
388
389 next_log_block++;
390 wrap(journal, next_log_block);
391
392 /* What kind of buffer is it?
393 *
394 * If it is a descriptor block, check that it has the
395 * expected sequence number. Otherwise, we're all done
396 * here. */
397
398 tmp = (journal_header_t *)bh->b_data;
399
400 if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
401 brelse(bh);
402 break;
403 }
404
405 blocktype = be32_to_cpu(tmp->h_blocktype);
406 sequence = be32_to_cpu(tmp->h_sequence);
407 jbd_debug(3, "Found magic %d, sequence %d\n",
408 blocktype, sequence);
409
410 if (sequence != next_commit_ID) {
411 brelse(bh);
412 break;
413 }
414
415 /* OK, we have a valid descriptor block which matches
416 * all of the sequence number checks. What are we going
417 * to do with it? That depends on the pass... */
418
419 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the
423 * blocks it describes. */
424 if (pass != PASS_REPLAY) {
425 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block);
427 brelse(bh);
428 continue;
429 }
430
431 /* A descriptor block: we can now write all of
432 * the data blocks. Yay, useful work is finally
433 * getting done here! */
434
435 tagp = &bh->b_data[sizeof(journal_header_t)];
436 while ((tagp - bh->b_data + tag_bytes)
437 <= journal->j_blocksize) {
438 unsigned long io_block;
439
440 tag = (journal_block_tag_t *) tagp;
441 flags = be32_to_cpu(tag->t_flags);
442
443 io_block = next_log_block++;
444 wrap(journal, next_log_block);
445 err = jread(&obh, journal, io_block);
446 if (err) {
447 /* Recover what we can, but
448 * report failure at the end. */
449 success = err;
450 printk (KERN_ERR
451 "JBD: IO error %d recovering "
452 "block %ld in log\n",
453 err, io_block);
454 } else {
455 unsigned long long blocknr;
456
457 J_ASSERT(obh != NULL);
458 blocknr = read_tag_block(tag_bytes,
459 tag);
460
461 /* If the block has been
462 * revoked, then we're all done
463 * here. */
464 if (jbd2_journal_test_revoke
465 (journal, blocknr,
466 next_commit_ID)) {
467 brelse(obh);
468 ++info->nr_revoke_hits;
469 goto skip_write;
470 }
471
472 /* Find a buffer for the new
473 * data being restored */
474 nbh = __getblk(journal->j_fs_dev,
475 blocknr,
476 journal->j_blocksize);
477 if (nbh == NULL) {
478 printk(KERN_ERR
479 "JBD: Out of memory "
480 "during recovery.\n");
481 err = -ENOMEM;
482 brelse(bh);
483 brelse(obh);
484 goto failed;
485 }
486
487 lock_buffer(nbh);
488 memcpy(nbh->b_data, obh->b_data,
489 journal->j_blocksize);
490 if (flags & JBD2_FLAG_ESCAPE) {
491 *((__be32 *)nbh->b_data) =
492 cpu_to_be32(JBD2_MAGIC_NUMBER);
493 }
494
495 BUFFER_TRACE(nbh, "marking dirty");
496 set_buffer_uptodate(nbh);
497 mark_buffer_dirty(nbh);
498 BUFFER_TRACE(nbh, "marking uptodate");
499 ++info->nr_replays;
500 /* ll_rw_block(WRITE, 1, &nbh); */
501 unlock_buffer(nbh);
502 brelse(obh);
503 brelse(nbh);
504 }
505
506 skip_write:
507 tagp += tag_bytes;
508 if (!(flags & JBD2_FLAG_SAME_UUID))
509 tagp += 16;
510
511 if (flags & JBD2_FLAG_LAST_TAG)
512 break;
513 }
514
515 brelse(bh);
516 continue;
517
518 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to
520 * do other than move on to the next sequence
521 * number. */
522 brelse(bh);
523 next_commit_ID++;
524 continue;
525
526 case JBD2_REVOKE_BLOCK:
527 /* If we aren't in the REVOKE pass, then we can
528 * just skip over this block. */
529 if (pass != PASS_REVOKE) {
530 brelse(bh);
531 continue;
532 }
533
534 err = scan_revoke_records(journal, bh,
535 next_commit_ID, info);
536 brelse(bh);
537 if (err)
538 goto failed;
539 continue;
540
541 default:
542 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
543 blocktype);
544 brelse(bh);
545 goto done;
546 }
547 }
548
549 done:
550 /*
551 * We broke out of the log scan loop: either we came to the
552 * known end of the log or we found an unexpected block in the
553 * log. If the latter happened, then we know that the "current"
554 * transaction marks the end of the valid log.
555 */
556
557 if (pass == PASS_SCAN)
558 info->end_transaction = next_commit_ID;
559 else {
560 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) {
563 printk (KERN_ERR "JBD: recovery pass %d ended at "
564 "transaction %u, expected %u\n",
565 pass, next_commit_ID, info->end_transaction);
566 if (!success)
567 success = -EIO;
568 }
569 }
570
571 return success;
572
573 failed:
574 return err;
575}
576
577
578/* Scan a revoke record, marking all blocks mentioned as revoked. */
579
580static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 tid_t sequence, struct recovery_info *info)
582{
583 jbd2_journal_revoke_header_t *header;
584 int offset, max;
585 int record_len = 4;
586
587 header = (jbd2_journal_revoke_header_t *) bh->b_data;
588 offset = sizeof(jbd2_journal_revoke_header_t);
589 max = be32_to_cpu(header->r_count);
590
591 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
592 record_len = 8;
593
594 while (offset + record_len <= max) {
595 unsigned long long blocknr;
596 int err;
597
598 if (record_len == 4)
599 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
600 else
601 blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
602 offset += record_len;
603 err = jbd2_journal_set_revoke(journal, blocknr, sequence);
604 if (err)
605 return err;
606 ++info->nr_revokes;
607 }
608 return 0;
609}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
1/*
2 * linux/fs/jbd2/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
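The commit-time interactions described above can be reduced to a toy model (two flags per block within one running transaction; nothing here resembles the real data structures):

#include <stdio.h>
#include <stdbool.h>

/* Toy per-block state within a single running transaction. */
struct blk_state { bool revoked; bool journaled; };

/* Journaling a block cancels any pending revoke in the same transaction, so
 * "revoke then journal" ends up journaled, while "journal then revoke" ends
 * up revoked (the revoke record lands later in the log than the block). */
static void revoke(struct blk_state *b)  { b->revoked = true; }
static void journal(struct blk_state *b) { b->journaled = true; b->revoked = false; }

int main(void)
{
	struct blk_state a = {0}, b = {0};

	revoke(&a);  journal(&a);    /* case 1: revoked and then journaled */
	journal(&b); revoke(&b);     /* case 2: journaled and then revoked */

	printf("case 1: revoked=%d journaled=%d\n", a.revoked, a.journaled); /* 0 1 */
	printf("case 2: revoked=%d journaled=%d\n", b.revoked, b.journaled); /* 1 1 */
	return 0;
}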
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd2.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *jbd2_revoke_record_cache;
74static kmem_cache_t *jbd2_revoke_table_cache;
75
76/* Each revoke record represents one single revoked block. During
77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */
79
80struct jbd2_revoke_record_s
81{
82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */
84 unsigned long long blocknr;
85};
86
87
88/* The revoke table is just a simple hash table of revoke records. */
89struct jbd2_revoke_table_s
90{
91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */
93 int hash_size;
94 int hash_shift;
95 struct list_head *hash_table;
96};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long long block)
110{
111 struct jbd2_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113 int hash = (int)block ^ (int)((block >> 31) >> 1);
114
115 return ((hash << (hash_shift - 6)) ^
116 (hash >> 13) ^
117 (hash << (hash_shift - 12))) & (table->hash_size - 1);
118}
119
120static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
121 tid_t seq)
122{
123 struct list_head *hash_list;
124 struct jbd2_revoke_record_s *record;
125
126repeat:
127 record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
128 if (!record)
129 goto oom;
130
131 record->sequence = seq;
132 record->blocknr = blocknr;
133 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
134 spin_lock(&journal->j_revoke_lock);
135 list_add(&record->hash, hash_list);
136 spin_unlock(&journal->j_revoke_lock);
137 return 0;
138
139oom:
140 if (!journal_oom_retry)
141 return -ENOMEM;
142 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
143 yield();
144 goto repeat;
145}
146
147/* Find a revoke record in the journal's hash table. */
148
149static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
150 unsigned long long blocknr)
151{
152 struct list_head *hash_list;
153 struct jbd2_revoke_record_s *record;
154
155 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
156
157 spin_lock(&journal->j_revoke_lock);
158 record = (struct jbd2_revoke_record_s *) hash_list->next;
159 while (&(record->hash) != hash_list) {
160 if (record->blocknr == blocknr) {
161 spin_unlock(&journal->j_revoke_lock);
162 return record;
163 }
164 record = (struct jbd2_revoke_record_s *) record->hash.next;
165 }
166 spin_unlock(&journal->j_revoke_lock);
167 return NULL;
168}
169
170int __init jbd2_journal_init_revoke_caches(void)
171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
175 if (jbd2_revoke_record_cache == 0)
176 return -ENOMEM;
177
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL, NULL);
181 if (jbd2_revoke_table_cache == 0) {
182 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL;
184 return -ENOMEM;
185 }
186 return 0;
187}
188
189void jbd2_journal_destroy_revoke_caches(void)
190{
191 kmem_cache_destroy(jbd2_revoke_record_cache);
192 jbd2_revoke_record_cache = NULL;
193 kmem_cache_destroy(jbd2_revoke_table_cache);
194 jbd2_revoke_table_cache = NULL;
195}
196
197/* Initialise the revoke table for a given journal to a given size. */
198
199int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
200{
201 int shift, tmp;
202
203 J_ASSERT (journal->j_revoke_table[0] == NULL);
204
205 shift = 0;
206 tmp = hash_size;
207 while((tmp >>= 1UL) != 0UL)
208 shift++;
209
210 journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
211 if (!journal->j_revoke_table[0])
212 return -ENOMEM;
213 journal->j_revoke = journal->j_revoke_table[0];
214
215 /* Check that the hash_size is a power of two */
216 J_ASSERT ((hash_size & (hash_size-1)) == 0);
217
218 journal->j_revoke->hash_size = hash_size;
219
220 journal->j_revoke->hash_shift = shift;
221
222 journal->j_revoke->hash_table =
223 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
224 if (!journal->j_revoke->hash_table) {
225 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
226 journal->j_revoke = NULL;
227 return -ENOMEM;
228 }
229
230 for (tmp = 0; tmp < hash_size; tmp++)
231 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
232
233 journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
234 if (!journal->j_revoke_table[1]) {
235 kfree(journal->j_revoke_table[0]->hash_table);
236 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
237 return -ENOMEM;
238 }
239
240 journal->j_revoke = journal->j_revoke_table[1];
241
242 /* Check that the hash_size is a power of two */
243 J_ASSERT ((hash_size & (hash_size-1)) == 0);
244
245 journal->j_revoke->hash_size = hash_size;
246
247 journal->j_revoke->hash_shift = shift;
248
249 journal->j_revoke->hash_table =
250 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
251 if (!journal->j_revoke->hash_table) {
252 kfree(journal->j_revoke_table[0]->hash_table);
253 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
254 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
255 journal->j_revoke = NULL;
256 return -ENOMEM;
257 }
258
259 for (tmp = 0; tmp < hash_size; tmp++)
260 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
261
262 spin_lock_init(&journal->j_revoke_lock);
263
264 return 0;
265}
266
267/* Destroy a journal's revoke table. The table must already be empty! */
268
269void jbd2_journal_destroy_revoke(journal_t *journal)
270{
271 struct jbd2_revoke_table_s *table;
272 struct list_head *hash_list;
273 int i;
274
275 table = journal->j_revoke_table[0];
276 if (!table)
277 return;
278
279 for (i=0; i<table->hash_size; i++) {
280 hash_list = &table->hash_table[i];
281 J_ASSERT (list_empty(hash_list));
282 }
283
284 kfree(table->hash_table);
285 kmem_cache_free(jbd2_revoke_table_cache, table);
286 journal->j_revoke = NULL;
287
288 table = journal->j_revoke_table[1];
289 if (!table)
290 return;
291
292 for (i=0; i<table->hash_size; i++) {
293 hash_list = &table->hash_table[i];
294 J_ASSERT (list_empty(hash_list));
295 }
296
297 kfree(table->hash_table);
298 kmem_cache_free(jbd2_revoke_table_cache, table);
299 journal->j_revoke = NULL;
300}
301
302
303#ifdef __KERNEL__
304
305/*
306 * jbd2_journal_revoke: revoke a given buffer_head from the journal. This
307 * prevents the block from being replayed during recovery if we take a
308 * crash after this current transaction commits. Any subsequent
309 * metadata writes of the buffer in this transaction cancel the
310 * revoke.
311 *
312 * Note that this call may block --- it is up to the caller to make
313 * sure that there are no further calls to journal_write_metadata
314 * before the revoke is complete. In ext3, this implies calling the
315 * revoke before clearing the block bitmap when we are deleting
316 * metadata.
317 *
318 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
319 * parameter, but does _not_ forget the buffer_head if the bh was only
320 * found implicitly.
321 *
322 * bh_in may not be a journalled buffer - it may have come off
323 * the hash tables without an attached journal_head.
324 *
325 * If bh_in is non-NULL, jbd2_journal_revoke() will decrement its b_count
326 * by one.
327 */
328
329int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
330 struct buffer_head *bh_in)
331{
332 struct buffer_head *bh = NULL;
333 journal_t *journal;
334 struct block_device *bdev;
335 int err;
336
337 might_sleep();
338 if (bh_in)
339 BUFFER_TRACE(bh_in, "enter");
340
341 journal = handle->h_transaction->t_journal;
342 if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
343 J_ASSERT (!"Cannot set revoke feature!");
344 return -EINVAL;
345 }
346
347 bdev = journal->j_fs_dev;
348 bh = bh_in;
349
350 if (!bh) {
351 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
352 if (bh)
353 BUFFER_TRACE(bh, "found on hash");
354 }
355#ifdef JBD_EXPENSIVE_CHECKING
356 else {
357 struct buffer_head *bh2;
358
359 /* If there is a different buffer_head lying around in
360 * memory anywhere... */
361 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
362 if (bh2) {
363 /* ... and it has RevokeValid status... */
364 if (bh2 != bh && buffer_revokevalid(bh2))
365 /* ...then it better be revoked too,
366 * since it's illegal to create a revoke
367 * record against a buffer_head which is
368 * not marked revoked --- that would
369 * risk missing a subsequent revoke
370 * cancel. */
371 J_ASSERT_BH(bh2, buffer_revoked(bh2));
372 put_bh(bh2);
373 }
374 }
375#endif
376
377 /* We really ought not ever to revoke twice in a row without
378 first having the revoke cancelled: it's illegal to free a
379 block twice without allocating it in between! */
380 if (bh) {
381 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
382 "inconsistent data on disk")) {
383 if (!bh_in)
384 brelse(bh);
385 return -EIO;
386 }
387 set_buffer_revoked(bh);
388 set_buffer_revokevalid(bh);
389 if (bh_in) {
390 BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
391 jbd2_journal_forget(handle, bh_in);
392 } else {
393 BUFFER_TRACE(bh, "call brelse");
394 __brelse(bh);
395 }
396 }
397
398 jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
399 err = insert_revoke_hash(journal, blocknr,
400 handle->h_transaction->t_tid);
401 BUFFER_TRACE(bh_in, "exit");
402 return err;
403}
404
405/*
406 * Cancel an outstanding revoke. For use only internally by the
407 * journaling code (called from jbd2_journal_get_write_access).
408 *
409 * We trust buffer_revoked() on the buffer if the buffer is already
410 * being journaled: if there is no revoke pending on the buffer, then we
411 * don't do anything here.
412 *
413 * This would break if it were possible for a buffer to be revoked and
414 * discarded, and then reallocated within the same transaction. In such
415 * a case we would have lost the revoked bit, but when we arrived here
416 * the second time we would still have a pending revoke to cancel. So,
417 * do not trust the Revoked bit on buffers unless RevokeValid is also
418 * set.
419 *
420 * The caller must have the journal locked.
421 */
422int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
423{
424 struct jbd2_revoke_record_s *record;
425 journal_t *journal = handle->h_transaction->t_journal;
426 int need_cancel;
427 int did_revoke = 0; /* akpm: debug */
428 struct buffer_head *bh = jh2bh(jh);
429
430 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
431
432 /* Is the existing Revoke bit valid? If so, we trust it, and
433 * only perform the full cancel if the revoke bit is set. If
434 * not, we can't trust the revoke bit, and we need to do the
435 * full search for a revoke record. */
436 if (test_set_buffer_revokevalid(bh)) {
437 need_cancel = test_clear_buffer_revoked(bh);
438 } else {
439 need_cancel = 1;
440 clear_buffer_revoked(bh);
441 }
442
443 if (need_cancel) {
444 record = find_revoke_record(journal, bh->b_blocknr);
445 if (record) {
446 jbd_debug(4, "cancelled existing revoke on "
447 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
448 spin_lock(&journal->j_revoke_lock);
449 list_del(&record->hash);
450 spin_unlock(&journal->j_revoke_lock);
451 kmem_cache_free(jbd2_revoke_record_cache, record);
452 did_revoke = 1;
453 }
454 }
455
456#ifdef JBD_EXPENSIVE_CHECKING
457 /* There better not be one left behind by now! */
458 record = find_revoke_record(journal, bh->b_blocknr);
459 J_ASSERT_JH(jh, record == NULL);
460#endif
461
462 /* Finally, have we just cleared revoke on an unhashed
463 * buffer_head? If so, we'd better make sure we clear the
464 * revoked status on any hashed alias too, otherwise the revoke
465 * state machine will get very upset later on. */
466 if (need_cancel) {
467 struct buffer_head *bh2;
468 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
469 if (bh2) {
470 if (bh2 != bh)
471 clear_buffer_revoked(bh2);
472 __brelse(bh2);
473 }
474 }
475 return did_revoke;
476}
477
478/* jbd2_journal_switch_revoke_table: select j_revoke for the next transaction;
479 * we do not want to suspend any processing until all revokes are
480 * written. -bzzz
481 */
482void jbd2_journal_switch_revoke_table(journal_t *journal)
483{
484 int i;
485
486 if (journal->j_revoke == journal->j_revoke_table[0])
487 journal->j_revoke = journal->j_revoke_table[1];
488 else
489 journal->j_revoke = journal->j_revoke_table[0];
490
491 for (i = 0; i < journal->j_revoke->hash_size; i++)
492 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
493}
494
495/*
496 * Write revoke records to the journal for all entries in the current
497 * revoke hash, deleting the entries as we go.
498 *
499 * Called with the journal lock held.
500 */
501
502void jbd2_journal_write_revoke_records(journal_t *journal,
503 transaction_t *transaction)
504{
505 struct journal_head *descriptor;
506 struct jbd2_revoke_record_s *record;
507 struct jbd2_revoke_table_s *revoke;
508 struct list_head *hash_list;
509 int i, offset, count;
510
511 descriptor = NULL;
512 offset = 0;
513 count = 0;
514
515 /* select revoke table for committing transaction */
516 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
517 journal->j_revoke_table[1] : journal->j_revoke_table[0];
518
519 for (i = 0; i < revoke->hash_size; i++) {
520 hash_list = &revoke->hash_table[i];
521
522 while (!list_empty(hash_list)) {
523 record = (struct jbd2_revoke_record_s *)
524 hash_list->next;
525 write_one_revoke_record(journal, transaction,
526 &descriptor, &offset,
527 record);
528 count++;
529 list_del(&record->hash);
530 kmem_cache_free(jbd2_revoke_record_cache, record);
531 }
532 }
533 if (descriptor)
534 flush_descriptor(journal, descriptor, offset);
535 jbd_debug(1, "Wrote %d revoke records\n", count);
536}
537
538/*
539 * Write out one revoke record. We need to create a new descriptor
540 * block if the old one is full or if we have not already created one.
541 */
542
543static void write_one_revoke_record(journal_t *journal,
544 transaction_t *transaction,
545 struct journal_head **descriptorp,
546 int *offsetp,
547 struct jbd2_revoke_record_s *record)
548{
549 struct journal_head *descriptor;
550 int offset;
551 journal_header_t *header;
552
553 /* If we are already aborting, this all becomes a noop. We
554 still need to go round the loop in
555 jbd2_journal_write_revoke_records in order to free all of the
556 revoke records: only the IO to the journal is omitted. */
557 if (is_journal_aborted(journal))
558 return;
559
560 descriptor = *descriptorp;
561 offset = *offsetp;
562
563 /* Make sure we have a descriptor with space left for the record */
564 if (descriptor) {
565 if (offset == journal->j_blocksize) {
566 flush_descriptor(journal, descriptor, offset);
567 descriptor = NULL;
568 }
569 }
570
571 if (!descriptor) {
572 descriptor = jbd2_journal_get_descriptor_buffer(journal);
573 if (!descriptor)
574 return;
575 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
576 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
577 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
578 header->h_sequence = cpu_to_be32(transaction->t_tid);
579
580 /* Record it so that we can wait for IO completion later */
581 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
582 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
583
584 offset = sizeof(jbd2_journal_revoke_header_t);
585 *descriptorp = descriptor;
586 }
587
588 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
589 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
590 cpu_to_be64(record->blocknr);
591 offset += 8;
592
593 } else {
594 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
595 cpu_to_be32(record->blocknr);
596 offset += 4;
597 }
598
599 *offsetp = offset;
600}
601
602/*
603 * Flush a revoke descriptor out to the journal. If we are aborting,
604 * this is a noop; otherwise we are generating a buffer which needs to
605 * be waited for during commit, so it has to go onto the appropriate
606 * journal buffer list.
607 */
608
609static void flush_descriptor(journal_t *journal,
610 struct journal_head *descriptor,
611 int offset)
612{
613 jbd2_journal_revoke_header_t *header;
614 struct buffer_head *bh = jh2bh(descriptor);
615
616 if (is_journal_aborted(journal)) {
617 put_bh(bh);
618 return;
619 }
620
621 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
622 header->r_count = cpu_to_be32(offset);
623 set_buffer_jwrite(bh);
624 BUFFER_TRACE(bh, "write");
625 set_buffer_dirty(bh);
626 ll_rw_block(SWRITE, 1, &bh);
627}
628#endif
629
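A standalone sketch of the record packing produced by write_one_revoke_record()/flush_descriptor() and consumed by scan_revoke_records() during recovery: r_count holds the byte offset one past the last record, and records are 4 or 8 bytes depending on the 64-bit feature. The header layout is simplified and endianness is omitted.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct revoke_hdr { uint32_t magic, blocktype, sequence, r_count; };  /* simplified */
union desc { struct revoke_hdr hdr; char bytes[256]; };

int main(void)
{
	union desc blk;
	uint64_t blocks[] = { 42, 4096, 123456789012ULL };
	int record_len = 8;                 /* as with JBD2_FEATURE_INCOMPAT_64BIT */
	int offset = sizeof(struct revoke_hdr);
	int i, off;

	memset(&blk, 0, sizeof(blk));
	for (i = 0; i < 3; i++) {           /* write_one_revoke_record() equivalent */
		memcpy(blk.bytes + offset, &blocks[i], record_len);
		offset += record_len;
	}
	blk.hdr.r_count = offset;           /* flush_descriptor() stores this offset */

	/* scan_revoke_records() equivalent: walk records up to r_count */
	for (off = sizeof(struct revoke_hdr);
	     off + record_len <= (int)blk.hdr.r_count; off += record_len) {
		uint64_t blocknr;

		memcpy(&blocknr, blk.bytes + off, record_len);
		printf("revoked block %llu\n", (unsigned long long)blocknr);
	}
	return 0;
}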
630/*
631 * Revoke support for recovery.
632 *
633 * Recovery needs to be able to:
634 *
635 * record all revoke records, including the tid of the latest instance
636 * of each revoke in the journal
637 *
638 * check whether a given block in a given transaction should be replayed
639 * (ie. has not been revoked by a revoke record in that or a subsequent
640 * transaction)
641 *
642 * empty the revoke table after recovery.
643 */
644
645/*
646 * First, setting revoke records. We create a new revoke record for
647 * every block ever revoked in the log as we scan it for recovery, and
648 * we update the existing records if we find multiple revokes for a
649 * single block.
650 */
651
652int jbd2_journal_set_revoke(journal_t *journal,
653 unsigned long long blocknr,
654 tid_t sequence)
655{
656 struct jbd2_revoke_record_s *record;
657
658 record = find_revoke_record(journal, blocknr);
659 if (record) {
660 /* If we have multiple occurrences, only record the
661 * latest sequence number in the hashed record */
662 if (tid_gt(sequence, record->sequence))
663 record->sequence = sequence;
664 return 0;
665 }
666 return insert_revoke_hash(journal, blocknr, sequence);
667}
668
669/*
670 * Test revoke records. For a given block referenced in the log, has
671 * that block been revoked? A revoke record with a given transaction
672 * sequence number revokes all blocks in that transaction and earlier
673 * ones, but later transactions still need to be replayed.
674 */
675
676int jbd2_journal_test_revoke(journal_t *journal,
677 unsigned long long blocknr,
678 tid_t sequence)
679{
680 struct jbd2_revoke_record_s *record;
681
682 record = find_revoke_record(journal, blocknr);
683 if (!record)
684 return 0;
685 if (tid_gt(sequence, record->sequence))
686 return 0;
687 return 1;
688}
689
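Ignoring the hash table and tid wraparound (which tid_gt() handles in the real code), the sequence-number rule implemented by jbd2_journal_set_revoke() and jbd2_journal_test_revoke() boils down to something like this single-record model:

#include <stdio.h>

/* Toy model: remember the latest transaction that revoked the block, and
 * treat log entries from that transaction or earlier as revoked. */
static unsigned int revoke_tid;   /* 0 means "no revoke recorded" */

static void set_revoke(unsigned int tid)
{
	if (tid > revoke_tid)     /* only the latest revoke counts */
		revoke_tid = tid;
}

static int test_revoke(unsigned int tid)
{
	return revoke_tid != 0 && tid <= revoke_tid;
}

int main(void)
{
	set_revoke(12);           /* block revoked in transaction 12  */
	set_revoke(10);           /* an older revoke does not win     */

	printf("%d %d %d\n",
	       test_revoke(11),   /* 1: revoked, do not replay */
	       test_revoke(12),   /* 1: revoked, do not replay */
	       test_revoke(13));  /* 0: later write, replay it */
	return 0;
}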
690/*
691 * Finally, once recovery is over, we need to clear the revoke table so
692 * that it can be reused by the running filesystem.
693 */
694
695void jbd2_journal_clear_revoke(journal_t *journal)
696{
697 int i;
698 struct list_head *hash_list;
699 struct jbd2_revoke_record_s *record;
700 struct jbd2_revoke_table_s *revoke;
701
702 revoke = journal->j_revoke;
703
704 for (i = 0; i < revoke->hash_size; i++) {
705 hash_list = &revoke->hash_table[i];
706 while (!list_empty(hash_list)) {
707 record = (struct jbd2_revoke_record_s*) hash_list->next;
708 list_del(&record->hash);
709 kmem_cache_free(jbd2_revoke_record_cache, record);
710 }
711 }
712}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..149957bef907
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2080 @@
1/*
2 * linux/fs/jbd2/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * jbd2_get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = transaction->t_expires;
57 add_timer(&journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 jbd2_get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __jbd2_log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207 * Also, this test is inconsistent with the matching one in
208 * jbd2_journal_extend().
209 */
210 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __jbd2_log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __jbd2_log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (unlikely(new_transaction)) /* It's usually NULL */
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *jbd2_journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffers we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or an ERR_PTR() value on failure
261 */
262handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
290
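The typical caller-side shape is roughly as follows. This is a hedged sketch only: example_update_block() is a made-up function, the single credit is arbitrary, jbd2_journal_get_write_access() is defined below in this file, and jbd2_journal_dirty_metadata()/jbd2_journal_stop() are assumed from the rest of the jbd2 API.

/* Sketch of a caller, not part of this patch. */
static int example_update_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle = jbd2_journal_start(journal, 1);
	int err;

	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}
	jbd2_journal_stop(handle);
	return err;
}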
291/**
292 * int jbd2_journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modifications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * jbd2_journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee the allocation - this is best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero on failure.
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int jbd2_journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __jbd2_log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
360
361
362/**
363 * int jbd2_journal_restart() - restart a handle.
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the jbd2_journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to jbd2_journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capable of guaranteeing the requested number of
374 * credits.
375 */
376
377int jbd2_journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __jbd2_log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
412
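The intended interplay of jbd2_journal_extend() and jbd2_journal_restart(), sketched below; example_get_more_credits() is a made-up helper that simply follows the documented return convention (0 on success, > 0 when the transaction is full, < 0 on error).

static int example_get_more_credits(handle_t *handle, int want)
{
	int err = jbd2_journal_extend(handle, want);

	if (err > 0)                                    /* transaction full:     */
		err = jbd2_journal_restart(handle, want);   /* commit and reattach */
	return err;
}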
413
414/**
415 * void jbd2_journal_lock_updates () - establish a transaction barrier.
416 * @journal: Journal to establish a barrier on.
417 *
418 * This locks out any further updates from being started, and blocks
419 * until all existing updates have completed, returning only once the
420 * journal is in a quiescent state with no updates running.
421 *
422 * The journal lock should not be held on entry.
423 */
424void jbd2_journal_lock_updates(journal_t *journal)
425{
426 DEFINE_WAIT(wait);
427
428 spin_lock(&journal->j_state_lock);
429 ++journal->j_barrier_count;
430
431 /* Wait until there are no running updates */
432 while (1) {
433 transaction_t *transaction = journal->j_running_transaction;
434
435 if (!transaction)
436 break;
437
438 spin_lock(&transaction->t_handle_lock);
439 if (!transaction->t_updates) {
440 spin_unlock(&transaction->t_handle_lock);
441 break;
442 }
443 prepare_to_wait(&journal->j_wait_updates, &wait,
444 TASK_UNINTERRUPTIBLE);
445 spin_unlock(&transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock);
447 schedule();
448 finish_wait(&journal->j_wait_updates, &wait);
449 spin_lock(&journal->j_state_lock);
450 }
451 spin_unlock(&journal->j_state_lock);
452
453 /*
454 * We have now established a barrier against other normal updates, but
455 * we also need to barrier against other jbd2_journal_lock_updates() calls
456 * to make sure that we serialise special journal-locked operations
457 * too.
458 */
459 mutex_lock(&journal->j_barrier);
460}
461
462/**
463 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on.
465 *
466 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
467 *
468 * Should be called without the journal lock held.
469 */
470void jbd2_journal_unlock_updates (journal_t *journal)
471{
472 J_ASSERT(journal->j_barrier_count != 0);
473
474 mutex_unlock(&journal->j_barrier);
475 spin_lock(&journal->j_state_lock);
476 --journal->j_barrier_count;
477 spin_unlock(&journal->j_state_lock);
478 wake_up(&journal->j_wait_transaction_locked);
479}
480
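Callers pair the two calls around any journal-wide operation; a minimal sketch (example_quiesced_operation() is made up, the pairing is the point):

static void example_quiesced_operation(journal_t *journal)
{
	jbd2_journal_lock_updates(journal);     /* blocks until running handles drain */
	/* ... operation that must not race with new handles ... */
	jbd2_journal_unlock_updates(journal);   /* wakes waiters on the barrier */
}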
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible.
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 int jlist;
494
495 /* If this buffer is one which might reasonably be dirty
496 * --- ie. data, or not part of this journal --- then
497 * we're OK to leave it alone, but otherwise we need to
498 * move the dirty bit to the journal's own internal
499 * JBDDirty bit. */
500 jlist = jh->b_jlist;
501
502 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
503 jlist == BJ_Shadow || jlist == BJ_Forget) {
504 struct buffer_head *bh = jh2bh(jh);
505
506 if (test_clear_buffer_dirty(bh))
507 set_buffer_jbddirty(bh);
508 }
509}
510
511/*
512 * If the buffer is already part of the current transaction, then there
513 * is nothing we need to do. If it is already part of a prior
514 * transaction which we are still committing to disk, then we need to
515 * make sure that we do not overwrite the old copy: we do copy-out to
516 * preserve the copy going to disk. We also account the buffer against
517 * the handle's metadata buffer credits (unless the buffer is already
518 * part of the transaction, that is).
519 *
520 */
521static int
522do_get_write_access(handle_t *handle, struct journal_head *jh,
523 int force_copy)
524{
525 struct buffer_head *bh;
526 transaction_t *transaction;
527 journal_t *journal;
528 int error;
529 char *frozen_buffer = NULL;
530 int need_copy = 0;
531
532 if (is_handle_aborted(handle))
533 return -EROFS;
534
535 transaction = handle->h_transaction;
536 journal = transaction->t_journal;
537
538 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
539
540 JBUFFER_TRACE(jh, "entry");
541repeat:
542 bh = jh2bh(jh);
543
544 /* @@@ Need to check for errors here at some point. */
545
546 lock_buffer(bh);
547 jbd_lock_bh_state(bh);
548
549 /* We now hold the buffer lock so it is safe to query the buffer
550 * state. Is the buffer dirty?
551 *
552 * If so, there are two possibilities. The buffer may be
553 * non-journaled, and undergoing a quite legitimate writeback.
554 * Otherwise, it is journaled, and we don't expect dirty buffers
555 * in that state (the buffers should be marked JBD_Dirty
556 * instead.) So either the IO is being done under our own
557 * control and this is a bug, or it's a third party IO such as
558 * dump(8) (which may leave the buffer scheduled for read ---
559 * ie. locked but not dirty) or tune2fs (which may actually have
560 * the buffer dirtied, ugh.) */
561
562 if (buffer_dirty(bh)) {
563 /*
564 * First question: is this buffer already part of the current
565 * transaction or the existing committing transaction?
566 */
567 if (jh->b_transaction) {
568 J_ASSERT_JH(jh,
569 jh->b_transaction == transaction ||
570 jh->b_transaction ==
571 journal->j_committing_transaction);
572 if (jh->b_next_transaction)
573 J_ASSERT_JH(jh, jh->b_next_transaction ==
574 transaction);
575 }
576 /*
577 * In any case we need to clean the dirty flag and we must
578 * do it under the buffer lock to be sure we don't race
579 * with running write-out.
580 */
581 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
582 jbd_unexpected_dirty_buffer(jh);
583 }
584
585 unlock_buffer(bh);
586
587 error = -EROFS;
588 if (is_handle_aborted(handle)) {
589 jbd_unlock_bh_state(bh);
590 goto out;
591 }
592 error = 0;
593
594 /*
595 * The buffer is already part of this transaction if b_transaction or
596 * b_next_transaction points to it
597 */
598 if (jh->b_transaction == transaction ||
599 jh->b_next_transaction == transaction)
600 goto done;
601
602 /*
603 * If there is already a copy-out version of this buffer, then we don't
604 * need to make another one
605 */
606 if (jh->b_frozen_data) {
607 JBUFFER_TRACE(jh, "has frozen data");
608 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
609 jh->b_next_transaction = transaction;
610 goto done;
611 }
612
613 /* Is there data here we need to preserve? */
614
615 if (jh->b_transaction && jh->b_transaction != transaction) {
616 JBUFFER_TRACE(jh, "owned by older transaction");
617 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
618 J_ASSERT_JH(jh, jh->b_transaction ==
619 journal->j_committing_transaction);
620
621 /* There is one case we have to be very careful about.
622 * If the committing transaction is currently writing
623 * this buffer out to disk and has NOT made a copy-out,
624 * then we cannot modify the buffer contents at all
625 * right now. The essence of copy-out is that it is the
626 * extra copy, not the primary copy, which gets
627 * journaled. If the primary copy is already going to
628 * disk then we cannot do copy-out here. */
629
630 if (jh->b_jlist == BJ_Shadow) {
631 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
632 wait_queue_head_t *wqh;
633
634 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
635
636 JBUFFER_TRACE(jh, "on shadow: sleep");
637 jbd_unlock_bh_state(bh);
638 /* commit wakes up all shadow buffers after IO */
639 for ( ; ; ) {
640 prepare_to_wait(wqh, &wait.wait,
641 TASK_UNINTERRUPTIBLE);
642 if (jh->b_jlist != BJ_Shadow)
643 break;
644 schedule();
645 }
646 finish_wait(wqh, &wait.wait);
647 goto repeat;
648 }
649
650 /* Only do the copy if the currently-owning transaction
651 * still needs it. If it is on the Forget list, the
652 * committing transaction is past that stage. The
653 * buffer had better remain locked during the kmalloc,
654 * but that should be true --- we hold the journal lock
655 * still and the buffer is already on the BUF_JOURNAL
656 * list so won't be flushed.
657 *
658 * Subtle point, though: if this is a get_undo_access,
659 * then we will be relying on the frozen_data to contain
660 * the new value of the committed_data record after the
661 * transaction, so we HAVE to force the frozen_data copy
662 * in that case. */
663
664 if (jh->b_jlist != BJ_Forget || force_copy) {
665 JBUFFER_TRACE(jh, "generate frozen data");
666 if (!frozen_buffer) {
667 JBUFFER_TRACE(jh, "allocate memory for buffer");
668 jbd_unlock_bh_state(bh);
669 frozen_buffer =
670 jbd2_slab_alloc(jh2bh(jh)->b_size,
671 GFP_NOFS);
672 if (!frozen_buffer) {
673 printk(KERN_EMERG
674 "%s: OOM for frozen_buffer\n",
675 __FUNCTION__);
676 JBUFFER_TRACE(jh, "oom!");
677 error = -ENOMEM;
678 jbd_lock_bh_state(bh);
679 goto done;
680 }
681 goto repeat;
682 }
683 jh->b_frozen_data = frozen_buffer;
684 frozen_buffer = NULL;
685 need_copy = 1;
686 }
687 jh->b_next_transaction = transaction;
688 }
689
690
691 /*
692 * Finally, if the buffer is not journaled right now, we need to make
693 * sure it doesn't get written to disk before the caller actually
694 * commits the new data
695 */
696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock);
702 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
703 spin_unlock(&journal->j_list_lock);
704 }
705
706done:
707 if (need_copy) {
708 struct page *page;
709 int offset;
710 char *source;
711
712 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
713 "Possible IO failure.\n");
714 page = jh2bh(jh)->b_page;
715 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
716 source = kmap_atomic(page, KM_USER0);
717 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
718 kunmap_atomic(source, KM_USER0);
719 }
720 jbd_unlock_bh_state(bh);
721
722 /*
723 * If we are about to journal a buffer, then any revoke pending on it is
724 * no longer valid
725 */
726 jbd2_journal_cancel_revoke(handle, jh);
727
728out:
729 if (unlikely(frozen_buffer)) /* It's usually NULL */
730 jbd2_slab_free(frozen_buffer, bh->b_size);
731
732 JBUFFER_TRACE(jh, "exit");
733 return error;
734}
735
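The copy-out rule above (snapshot the committing transaction's version of the data before the running transaction scribbles on the live buffer) can be reduced to a toy userspace model; none of this is the real jbd2 data structure.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_buf {
	char data[16];        /* live buffer contents                        */
	char *frozen_data;    /* snapshot owned by the committing transaction */
};

static void get_write_access(struct toy_buf *b, int owned_by_committing)
{
	if (owned_by_committing && !b->frozen_data) {
		b->frozen_data = malloc(sizeof(b->data));
		memcpy(b->frozen_data, b->data, sizeof(b->data));  /* copy-out */
	}
}

int main(void)
{
	struct toy_buf b = { .data = "old", .frozen_data = NULL };

	get_write_access(&b, 1);      /* committing transaction still owns it */
	strcpy(b.data, "new");        /* running transaction modifies it      */

	printf("journal writes: %s, live buffer: %s\n", b.frozen_data, b.data);
	free(b.frozen_data);
	return 0;
}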
736/**
737 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
738 * @handle: transaction to add buffer modifications to
739 * @bh: bh to be used for metadata writes
740 * @credits: variable that will receive credits for the buffer
741 *
742 * Returns an error code or 0 on success.
743 *
744 * In full data journalling mode the buffer may be of type BJ_AsyncData,
745 * because we're write()ing a buffer which is also part of a shared mapping.
746 */
747
748int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
749{
750 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
751 int rc;
752
753 /* We do not want to get caught playing with fields which the
754 * log thread also manipulates. Make sure that the buffer
755 * completes any outstanding IO before proceeding. */
756 rc = do_get_write_access(handle, jh, 0);
757 jbd2_journal_put_journal_head(jh);
758 return rc;
759}
760
761
762/*
763 * When the user wants to journal a newly created buffer_head
764 * (ie. getblk() returned a new buffer and we are going to populate it
765 * manually rather than reading off disk), then we need to keep the
766 * buffer_head locked until it has been completely filled with new
767 * data. In this case, we should be able to make the assertion that
768 * the bh is not already part of an existing transaction.
769 *
770 * The buffer should already be locked by the caller by this point.
771 * There is no lock ranking violation: it was a newly created,
772 * unlocked buffer beforehand. */
773
774/**
775 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
776 * @handle: transaction to add the new buffer to
777 * @bh: new buffer.
778 *
779 * Call this if you create a new bh.
780 */
781int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
782{
783 transaction_t *transaction = handle->h_transaction;
784 journal_t *journal = transaction->t_journal;
785 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
786 int err;
787
788 jbd_debug(5, "journal_head %p\n", jh);
789 err = -EROFS;
790 if (is_handle_aborted(handle))
791 goto out;
792 err = 0;
793
794 JBUFFER_TRACE(jh, "entry");
795 /*
796 * The buffer may already belong to this transaction due to pre-zeroing
797 * in the filesystem's new_block code. It may also be on the previous,
798 * committing transaction's lists, but it HAS to be in Forget state in
799 * that case: the transaction must have deleted the buffer for it to be
800 * reused here.
801 */
802 jbd_lock_bh_state(bh);
803 spin_lock(&journal->j_list_lock);
804 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
805 jh->b_transaction == NULL ||
806 (jh->b_transaction == journal->j_committing_transaction &&
807 jh->b_jlist == BJ_Forget)));
808
809 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
810 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
811
812 if (jh->b_transaction == NULL) {
813 jh->b_transaction = transaction;
814 JBUFFER_TRACE(jh, "file as BJ_Reserved");
815 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
816 } else if (jh->b_transaction == journal->j_committing_transaction) {
817 JBUFFER_TRACE(jh, "set next transaction");
818 jh->b_next_transaction = transaction;
819 }
820 spin_unlock(&journal->j_list_lock);
821 jbd_unlock_bh_state(bh);
822
823 /*
824 * akpm: I added this. ext3_alloc_branch can pick up new indirect
825 * blocks which contain freed but then revoked metadata. We need
826 * to cancel the revoke in case we end up freeing it yet again
827 * and then reallocating it as data - this would cause a second revoke,
828 * which hits an assertion error.
829 */
830 JBUFFER_TRACE(jh, "cancelling revoke");
831 jbd2_journal_cancel_revoke(handle, jh);
832 jbd2_journal_put_journal_head(jh);
833out:
834 return err;
835}
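/*
 * Illustrative sketch, not part of this file: a filesystem that has just
 * allocated a fresh metadata block usually locks the new buffer, declares
 * create access, fills the block and only then marks it dirty.  sb and
 * blocknr are assumed to come from the caller's allocator.
 *
 *	bh = sb_getblk(sb, blocknr);
 *	lock_buffer(bh);
 *	err = jbd2_journal_get_create_access(handle, bh);
 *	if (!err) {
 *		memset(bh->b_data, 0, bh->b_size);
 *		set_buffer_uptodate(bh);
 *	}
 *	unlock_buffer(bh);
 *	if (!err)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 *	brelse(bh);
 */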
836
837/**
838 * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
839 * non-rewindable consequences
840 * @handle: transaction
841 * @bh: buffer to undo
842 * @credits: store the number of taken credits here (if not NULL)
843 *
844 * Sometimes there is a need to distinguish between metadata which has
845 * been committed to disk and that which has not. The ext3fs code uses
846 * this for freeing and allocating space, we have to make sure that we
847 * do not reuse freed space until the deallocation has been committed,
848 * since if we overwrote that space we would make the delete
849 * un-rewindable in case of a crash.
850 *
851 * To deal with that, jbd2_journal_get_undo_access requests write access to a
852 * buffer for parts of non-rewindable operations such as delete
853 * operations on the bitmaps. The journaling code must keep a copy of
854 * the buffer's contents prior to the undo_access call until such time
855 * as we know that the buffer has definitely been committed to disk.
856 *
857 * We never need to know which transaction the committed data is part
858 * of, buffers touched here are guaranteed to be dirtied later and so
859 * will be committed to a new transaction in due course, at which point
860 * we can discard the old committed data pointer.
861 *
862 * Returns error number or 0 on success.
863 */
864int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
865{
866 int err;
867 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
868 char *committed_data = NULL;
869
870 JBUFFER_TRACE(jh, "entry");
871
872 /*
873 * Do this first --- it can drop the journal lock, so we want to
874 * make sure that obtaining the committed_data is done
875 * atomically wrt. completion of any outstanding commits.
876 */
877 err = do_get_write_access(handle, jh, 1);
878 if (err)
879 goto out;
880
881repeat:
882 if (!jh->b_committed_data) {
883 committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
884 if (!committed_data) {
885 printk(KERN_EMERG "%s: No memory for committed data\n",
886 __FUNCTION__);
887 err = -ENOMEM;
888 goto out;
889 }
890 }
891
892 jbd_lock_bh_state(bh);
893 if (!jh->b_committed_data) {
894 /* Copy out the current buffer contents into the
895 * preserved, committed copy. */
896 JBUFFER_TRACE(jh, "generate b_committed data");
897 if (!committed_data) {
898 jbd_unlock_bh_state(bh);
899 goto repeat;
900 }
901
902 jh->b_committed_data = committed_data;
903 committed_data = NULL;
904 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
905 }
906 jbd_unlock_bh_state(bh);
907out:
908 jbd2_journal_put_journal_head(jh);
909 if (unlikely(committed_data))
910 jbd2_slab_free(committed_data, bh->b_size);
911 return err;
912}
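/*
 * Illustrative sketch, not part of this file: block-bitmap manipulation is
 * the classic caller.  Undo access preserves the committed copy of the
 * bitmap before the caller flips allocation bits; bitmap_bh is assumed to
 * be the bitmap buffer already read by the caller.
 *
 *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *	if (err)
 *		goto fail;
 *	... set or clear bits in bitmap_bh->b_data ...
 *	err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 */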
913
914/**
915 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
916 * needs to be flushed before we can commit the
917 * current transaction.
918 * @handle: transaction
919 * @bh: bufferhead to mark
920 *
921 * The buffer is placed on the transaction's data list and is marked as
922 * belonging to the transaction.
923 *
924 * Returns error number or 0 on success.
925 *
926 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
927 * by kswapd.
928 */
929int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
930{
931 journal_t *journal = handle->h_transaction->t_journal;
932 int need_brelse = 0;
933 struct journal_head *jh;
934
935 if (is_handle_aborted(handle))
936 return 0;
937
938 jh = jbd2_journal_add_journal_head(bh);
939 JBUFFER_TRACE(jh, "entry");
940
941 /*
942 * The buffer could *already* be dirty. Writeout can start
943 * at any time.
944 */
945 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
946
947 /*
948 * What if the buffer is already part of a running transaction?
949 *
950 * There are two cases:
951 * 1) It is part of the current running transaction. Refile it,
952 * just in case we have allocated it as metadata, deallocated
953 * it, then reallocated it as data.
954 * 2) It is part of the previous, still-committing transaction.
955 * If all we want to do is to guarantee that the buffer will be
956 * written to disk before this new transaction commits, then
957 * being sure that the *previous* transaction has this same
958 * property is sufficient for us! Just leave it on its old
959 * transaction.
960 *
961 * In case (2), the buffer must not already exist as metadata
962 * --- that would violate write ordering (a transaction is free
963 * to write its data at any point, even before the previous
964 * committing transaction has committed). The caller must
965 * never, ever allow this to happen: there's nothing we can do
966 * about it in this layer.
967 */
968 jbd_lock_bh_state(bh);
969 spin_lock(&journal->j_list_lock);
970 if (jh->b_transaction) {
971 JBUFFER_TRACE(jh, "has transaction");
972 if (jh->b_transaction != handle->h_transaction) {
973 JBUFFER_TRACE(jh, "belongs to older transaction");
974 J_ASSERT_JH(jh, jh->b_transaction ==
975 journal->j_committing_transaction);
976
977 /* @@@ IS THIS TRUE ? */
978 /*
979 * Not any more. Scenario: someone does a write()
980 * in data=journal mode. The buffer's transaction has
981 * moved into commit. Then someone does another
982 * write() to the file. We do the frozen data copyout
983 * and set b_next_transaction to point to j_running_t.
984 * And while we're in that state, someone does a
985 * writepage() in an attempt to pageout the same area
986 * of the file via a shared mapping. At present that
987 * calls jbd2_journal_dirty_data(), and we get right here.
988 * It may be too late to journal the data. Simply
989 * falling through to the next test will suffice: the
990 * data will be dirty and will be checkpointed. The
991 * ordering comments in the next comment block still
992 * apply.
993 */
994 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
995
996 /*
997 * If we're journalling data, and this buffer was
998 * subject to a write(), it could be metadata, forget
999 * or shadow against the committing transaction. Now,
1000 * someone has dirtied the same darn page via a mapping
1001 * and it is being writepage()'d.
1002 * We *could* just steal the page from commit, with some
1003 * fancy locking there. Instead, we just skip it -
1004 * don't tie the page's buffers to the new transaction
1005 * at all.
1006 * Implication: if we crash before the writepage() data
1007 * is written into the filesystem, recovery will replay
1008 * the write() data.
1009 */
1010 if (jh->b_jlist != BJ_None &&
1011 jh->b_jlist != BJ_SyncData &&
1012 jh->b_jlist != BJ_Locked) {
1013 JBUFFER_TRACE(jh, "Not stealing");
1014 goto no_journal;
1015 }
1016
1017 /*
1018 * This buffer may be undergoing writeout in commit. We
1019 * can't return from here and let the caller dirty it
1020 * again because that can cause the write-out loop in
1021 * commit to never terminate.
1022 */
1023 if (buffer_dirty(bh)) {
1024 get_bh(bh);
1025 spin_unlock(&journal->j_list_lock);
1026 jbd_unlock_bh_state(bh);
1027 need_brelse = 1;
1028 sync_dirty_buffer(bh);
1029 jbd_lock_bh_state(bh);
1030 spin_lock(&journal->j_list_lock);
1031 /* The buffer may become locked again at any
1032 time if it is redirtied */
1033 }
1034
1035 /* journal_clean_data_list() may have got there first */
1036 if (jh->b_transaction != NULL) {
1037 JBUFFER_TRACE(jh, "unfile from commit");
1038 __jbd2_journal_temp_unlink_buffer(jh);
1039 /* It still points to the committing
1040 * transaction; move it to this one so
1041 * that the refile assert checks are
1042 * happy. */
1043 jh->b_transaction = handle->h_transaction;
1044 }
1045 /* The buffer will be refiled below */
1046
1047 }
1048 /*
1049 * Special case --- the buffer might actually have been
1050 * allocated and then immediately deallocated in the previous,
1051 * committing transaction, so might still be left on that
1052 * transaction's metadata lists.
1053 */
1054 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1055 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1056 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1057 __jbd2_journal_temp_unlink_buffer(jh);
1058 jh->b_transaction = handle->h_transaction;
1059 JBUFFER_TRACE(jh, "file as data");
1060 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1061 BJ_SyncData);
1062 }
1063 } else {
1064 JBUFFER_TRACE(jh, "not on a transaction");
1065 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1066 }
1067no_journal:
1068 spin_unlock(&journal->j_list_lock);
1069 jbd_unlock_bh_state(bh);
1070 if (need_brelse) {
1071 BUFFER_TRACE(bh, "brelse");
1072 __brelse(bh);
1073 }
1074 JBUFFER_TRACE(jh, "exit");
1075 jbd2_journal_put_journal_head(jh);
1076 return 0;
1077}
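/*
 * Illustrative sketch, not part of this file: an ordered-mode write path
 * typically walks every buffer on the page and files each one with this
 * call before the surrounding handle is stopped.
 *
 *	bh = head = page_buffers(page);
 *	do {
 *		err = jbd2_journal_dirty_data(handle, bh);
 *		if (err)
 *			break;
 *		bh = bh->b_this_page;
 *	} while (bh != head);
 */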
1078
1079/**
1080 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark
1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction.
1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction.
1089 *
1090 * Returns error number or 0 on success.
1091 *
1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen
1094 * data present for that commit). In that case, we don't relink the
1095 * buffer: that only gets done when the old transaction finally
1096 * completes its commit.
1097 */
1098int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1099{
1100 transaction_t *transaction = handle->h_transaction;
1101 journal_t *journal = transaction->t_journal;
1102 struct journal_head *jh = bh2jh(bh);
1103
1104 jbd_debug(5, "journal_head %p\n", jh);
1105 JBUFFER_TRACE(jh, "entry");
1106 if (is_handle_aborted(handle))
1107 goto out;
1108
1109 jbd_lock_bh_state(bh);
1110
1111 if (jh->b_modified == 0) {
1112 /*
1113 * This buffer has been modified and is becoming part
1114 * of the transaction. This needs to be done
1115 * once per transaction -bzzz
1116 */
1117 jh->b_modified = 1;
1118 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1119 handle->h_buffer_credits--;
1120 }
1121
1122 /*
1123 * fastpath, to avoid expensive locking. If this buffer is already
1124 * on the running transaction's metadata list there is nothing to do.
1125 * Nobody can take it off again because there is a handle open.
1126 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1127 * result in this test being false, so we go in and take the locks.
1128 */
1129 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1130 JBUFFER_TRACE(jh, "fastpath");
1131 J_ASSERT_JH(jh, jh->b_transaction ==
1132 journal->j_running_transaction);
1133 goto out_unlock_bh;
1134 }
1135
1136 set_buffer_jbddirty(bh);
1137
1138 /*
1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now.
1143 */
1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction");
1146 J_ASSERT_JH(jh, jh->b_transaction ==
1147 journal->j_committing_transaction);
1148 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1149 /* And this case is illegal: we can't reuse another
1150 * transaction's data buffer, ever. */
1151 goto out_unlock_bh;
1152 }
1153
1154 /* That test should have eliminated the following case: */
1155 J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1156
1157 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1158 spin_lock(&journal->j_list_lock);
1159 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1160 spin_unlock(&journal->j_list_lock);
1161out_unlock_bh:
1162 jbd_unlock_bh_state(bh);
1163out:
1164 JBUFFER_TRACE(jh, "exit");
1165 return 0;
1166}
1167
1168/*
1169 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1170 * updates, if the update decided in the end that it didn't need access.
1171 *
1172 */
1173void
1174jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1175{
1176 BUFFER_TRACE(bh, "entry");
1177}
1178
1179/**
1180 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1181 * @handle: transaction handle
1182 * @bh: bh to 'forget'
1183 *
1184 * We can only do the bforget if there are no commits pending against the
1185 * buffer. If the buffer is dirty in the current running transaction we
1186 * can safely unlink it.
1187 *
1188 * bh may not be a journalled buffer at all - it may be a non-JBD
1189 * buffer which came off the hashtable. Check for this.
1190 *
1191 * Decrements bh->b_count by one.
1192 *
1193 * Allow this call even if the handle has aborted --- it may be part of
1194 * the caller's cleanup after an abort.
1195 */
1196int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1197{
1198 transaction_t *transaction = handle->h_transaction;
1199 journal_t *journal = transaction->t_journal;
1200 struct journal_head *jh;
1201 int drop_reserve = 0;
1202 int err = 0;
1203
1204 BUFFER_TRACE(bh, "entry");
1205
1206 jbd_lock_bh_state(bh);
1207 spin_lock(&journal->j_list_lock);
1208
1209 if (!buffer_jbd(bh))
1210 goto not_jbd;
1211 jh = bh2jh(bh);
1212
1213 /* Critical error: attempting to delete a bitmap buffer, maybe?
1214 * Don't do any jbd operations, and return an error. */
1215 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1216 "inconsistent data on disk")) {
1217 err = -EIO;
1218 goto not_jbd;
1219 }
1220
1221 /*
1222 * The buffer is being dropped from the transaction, so we must drop
1223 * all references to it -bzzz
1224 */
1225 jh->b_modified = 0;
1226
1227 if (jh->b_transaction == handle->h_transaction) {
1228 J_ASSERT_JH(jh, !jh->b_frozen_data);
1229
1230 /* If we are forgetting a buffer which is already part
1231 * of this transaction, then we can just drop it from
1232 * the transaction immediately. */
1233 clear_buffer_dirty(bh);
1234 clear_buffer_jbddirty(bh);
1235
1236 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1237
1238 drop_reserve = 1;
1239
1240 /*
1241 * We are no longer going to journal this buffer.
1242 * However, the commit of this transaction is still
1243 * important to the buffer: the delete that we are now
1244 * processing might obsolete an old log entry, so by
1245 * committing, we can satisfy the buffer's checkpoint.
1246 *
1247 * So, if we have a checkpoint on the buffer, we should
1248 * now refile the buffer on our BJ_Forget list so that
1249 * we know to remove the checkpoint after we commit.
1250 */
1251
1252 if (jh->b_cp_transaction) {
1253 __jbd2_journal_temp_unlink_buffer(jh);
1254 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1255 } else {
1256 __jbd2_journal_unfile_buffer(jh);
1257 jbd2_journal_remove_journal_head(bh);
1258 __brelse(bh);
1259 if (!buffer_jbd(bh)) {
1260 spin_unlock(&journal->j_list_lock);
1261 jbd_unlock_bh_state(bh);
1262 __bforget(bh);
1263 goto drop;
1264 }
1265 }
1266 } else if (jh->b_transaction) {
1267 J_ASSERT_JH(jh, (jh->b_transaction ==
1268 journal->j_committing_transaction));
1269 /* However, if the buffer is still owned by a prior
1270 * (committing) transaction, we can't drop it yet... */
1271 JBUFFER_TRACE(jh, "belongs to older transaction");
1272 /* ... but we CAN drop it from the new transaction if we
1273 * have also modified it since the original commit. */
1274
1275 if (jh->b_next_transaction) {
1276 J_ASSERT(jh->b_next_transaction == transaction);
1277 jh->b_next_transaction = NULL;
1278 drop_reserve = 1;
1279 }
1280 }
1281
1282not_jbd:
1283 spin_unlock(&journal->j_list_lock);
1284 jbd_unlock_bh_state(bh);
1285 __brelse(bh);
1286drop:
1287 if (drop_reserve) {
1288 /* no need to reserve log space for this block -bzzz */
1289 handle->h_buffer_credits++;
1290 }
1291 return err;
1292}
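/*
 * Illustrative sketch, not part of this file: a caller that frees a
 * metadata block it had already obtained write access to forgets the
 * buffer instead of dirtying it.  jbd2_journal_forget() consumes one
 * bh reference itself, so the usual brelse() is omitted afterwards:
 *
 *	err = jbd2_journal_forget(handle, bh);
 *	bh = NULL;
 */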
1293
1294/**
1295 * int jbd2_journal_stop() - complete a transaction
1296 * @handle: transaction to complete.
1297 *
1298 * All done for a particular handle.
1299 *
1300 * There is not much action needed here. We just return any remaining
1301 * buffer credits to the transaction and remove the handle. The only
1302 * complication is that we need to start a commit operation if the
1303 * filesystem is marked for synchronous update.
1304 *
1305 * jbd2_journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a jbd2_journal_abort has been executed since the
1308 * transaction began.
1309 */
1310int jbd2_journal_stop(handle_t *handle)
1311{
1312 transaction_t *transaction = handle->h_transaction;
1313 journal_t *journal = transaction->t_journal;
1314 int old_handle_count, err;
1315 pid_t pid;
1316
1317 J_ASSERT(transaction->t_updates > 0);
1318 J_ASSERT(journal_current_handle() == handle);
1319
1320 if (is_handle_aborted(handle))
1321 err = -EIO;
1322 else
1323 err = 0;
1324
1325 if (--handle->h_ref > 0) {
1326 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1327 handle->h_ref);
1328 return err;
1329 }
1330
1331 jbd_debug(4, "Handle %p going down\n", handle);
1332
1333 /*
1334 * Implement synchronous transaction batching. If the handle
1335 * was synchronous, don't force a commit immediately. Let's
1336 * yield and let another thread piggyback onto this transaction.
1337 * Keep doing that while new threads continue to arrive.
1338 * It doesn't cost much - we're about to run a commit and sleep
1339 * on IO anyway. Speeds up many-threaded, many-dir operations
1340 * by 30x or more...
1341 *
1342 * But don't do this if this process was the most recent one to
1343 * perform a synchronous write. We do this to detect the case where a
1344 * single process is doing a stream of sync writes. No point in waiting
1345 * for joiners in that case.
1346 */
1347 pid = current->pid;
1348 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1349 journal->j_last_sync_writer = pid;
1350 do {
1351 old_handle_count = transaction->t_handle_count;
1352 schedule_timeout_uninterruptible(1);
1353 } while (old_handle_count != transaction->t_handle_count);
1354 }
1355
1356 current->journal_info = NULL;
1357 spin_lock(&journal->j_state_lock);
1358 spin_lock(&transaction->t_handle_lock);
1359 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1360 transaction->t_updates--;
1361 if (!transaction->t_updates) {
1362 wake_up(&journal->j_wait_updates);
1363 if (journal->j_barrier_count)
1364 wake_up(&journal->j_wait_transaction_locked);
1365 }
1366
1367 /*
1368 * If the handle is marked SYNC, we need to set another commit
1369 * going! We also want to force a commit if the current
1370 * transaction is occupying too much of the log, or if the
1371 * transaction is too old now.
1372 */
1373 if (handle->h_sync ||
1374 transaction->t_outstanding_credits >
1375 journal->j_max_transaction_buffers ||
1376 time_after_eq(jiffies, transaction->t_expires)) {
1377 /* Do this even for aborted journals: an abort still
1378 * completes the commit thread, it just doesn't write
1379 * anything to disk. */
1380 tid_t tid = transaction->t_tid;
1381
1382 spin_unlock(&transaction->t_handle_lock);
1383 jbd_debug(2, "transaction too old, requesting commit for "
1384 "handle %p\n", handle);
1385 /* This is non-blocking */
1386 __jbd2_log_start_commit(journal, transaction->t_tid);
1387 spin_unlock(&journal->j_state_lock);
1388
1389 /*
1390 * Special case: JBD2_SYNC synchronous updates require us
1391 * to wait for the commit to complete.
1392 */
1393 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1394 err = jbd2_log_wait_commit(journal, tid);
1395 } else {
1396 spin_unlock(&transaction->t_handle_lock);
1397 spin_unlock(&journal->j_state_lock);
1398 }
1399
1400 jbd_free_handle(handle);
1401 return err;
1402}
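/*
 * Illustrative sketch, not part of this file: an fsync-style path marks
 * the handle synchronous before stopping it, so jbd2_journal_stop() both
 * requests the commit and waits for it to complete.
 *
 *	handle->h_sync = 1;
 *	err = jbd2_journal_stop(handle);
 */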
1403
1404 /** int jbd2_journal_force_commit() - force any uncommitted transactions
1405 * @journal: journal to force
1406 *
1407 * For synchronous operations: force any uncommitted transactions
1408 * to disk. May seem kludgy, but it reuses all the handle batching
1409 * code in a very simple manner.
1410 */
1411int jbd2_journal_force_commit(journal_t *journal)
1412{
1413 handle_t *handle;
1414 int ret;
1415
1416 handle = jbd2_journal_start(journal, 1);
1417 if (IS_ERR(handle)) {
1418 ret = PTR_ERR(handle);
1419 } else {
1420 handle->h_sync = 1;
1421 ret = jbd2_journal_stop(handle);
1422 }
1423 return ret;
1424}
1425
1426/*
1427 *
1428 * List management code snippets: various functions for manipulating the
1429 * transaction buffer lists.
1430 *
1431 */
1432
1433/*
1434 * Append a buffer to a transaction list, given the transaction's list head
1435 * pointer.
1436 *
1437 * j_list_lock is held.
1438 *
1439 * jbd_lock_bh_state(jh2bh(jh)) is held.
1440 */
1441
1442static inline void
1443__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1444{
1445 if (!*list) {
1446 jh->b_tnext = jh->b_tprev = jh;
1447 *list = jh;
1448 } else {
1449 /* Insert at the tail of the list to preserve order */
1450 struct journal_head *first = *list, *last = first->b_tprev;
1451 jh->b_tprev = last;
1452 jh->b_tnext = first;
1453 last->b_tnext = first->b_tprev = jh;
1454 }
1455}
1456
1457/*
1458 * Remove a buffer from a transaction list, given the transaction's list
1459 * head pointer.
1460 *
1461 * Called with j_list_lock held, and the journal may not be locked.
1462 *
1463 * jbd_lock_bh_state(jh2bh(jh)) is held.
1464 */
1465
1466static inline void
1467__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1468{
1469 if (*list == jh) {
1470 *list = jh->b_tnext;
1471 if (*list == jh)
1472 *list = NULL;
1473 }
1474 jh->b_tprev->b_tnext = jh->b_tnext;
1475 jh->b_tnext->b_tprev = jh->b_tprev;
1476}
1477
1478/*
1479 * Remove a buffer from the appropriate transaction list.
1480 *
1481 * Note that this function can *change* the value of
1482 * jh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1483 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1484 * is holding onto a copy of one of these pointers, it could go bad.
1485 * Generally the caller needs to re-read the pointer from the transaction_t.
1486 *
1487 * Called under j_list_lock. The journal may not be locked.
1488 */
1489void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1490{
1491 struct journal_head **list = NULL;
1492 transaction_t *transaction;
1493 struct buffer_head *bh = jh2bh(jh);
1494
1495 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1496 transaction = jh->b_transaction;
1497 if (transaction)
1498 assert_spin_locked(&transaction->t_journal->j_list_lock);
1499
1500 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1501 if (jh->b_jlist != BJ_None)
1502 J_ASSERT_JH(jh, transaction != 0);
1503
1504 switch (jh->b_jlist) {
1505 case BJ_None:
1506 return;
1507 case BJ_SyncData:
1508 list = &transaction->t_sync_datalist;
1509 break;
1510 case BJ_Metadata:
1511 transaction->t_nr_buffers--;
1512 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1513 list = &transaction->t_buffers;
1514 break;
1515 case BJ_Forget:
1516 list = &transaction->t_forget;
1517 break;
1518 case BJ_IO:
1519 list = &transaction->t_iobuf_list;
1520 break;
1521 case BJ_Shadow:
1522 list = &transaction->t_shadow_list;
1523 break;
1524 case BJ_LogCtl:
1525 list = &transaction->t_log_list;
1526 break;
1527 case BJ_Reserved:
1528 list = &transaction->t_reserved_list;
1529 break;
1530 case BJ_Locked:
1531 list = &transaction->t_locked_list;
1532 break;
1533 }
1534
1535 __blist_del_buffer(list, jh);
1536 jh->b_jlist = BJ_None;
1537 if (test_clear_buffer_jbddirty(bh))
1538 mark_buffer_dirty(bh); /* Expose it to the VM */
1539}
1540
1541void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1542{
1543 __jbd2_journal_temp_unlink_buffer(jh);
1544 jh->b_transaction = NULL;
1545}
1546
1547void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1548{
1549 jbd_lock_bh_state(jh2bh(jh));
1550 spin_lock(&journal->j_list_lock);
1551 __jbd2_journal_unfile_buffer(jh);
1552 spin_unlock(&journal->j_list_lock);
1553 jbd_unlock_bh_state(jh2bh(jh));
1554}
1555
1556/*
1557 * Called from jbd2_journal_try_to_free_buffers().
1558 *
1559 * Called under jbd_lock_bh_state(bh)
1560 */
1561static void
1562__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1563{
1564 struct journal_head *jh;
1565
1566 jh = bh2jh(bh);
1567
1568 if (buffer_locked(bh) || buffer_dirty(bh))
1569 goto out;
1570
1571 if (jh->b_next_transaction != 0)
1572 goto out;
1573
1574 spin_lock(&journal->j_list_lock);
1575 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1576 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1577 /* A written-back ordered data buffer */
1578 JBUFFER_TRACE(jh, "release data");
1579 __jbd2_journal_unfile_buffer(jh);
1580 jbd2_journal_remove_journal_head(bh);
1581 __brelse(bh);
1582 }
1583 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1584 /* written-back checkpointed metadata buffer */
1585 if (jh->b_jlist == BJ_None) {
1586 JBUFFER_TRACE(jh, "remove from checkpoint list");
1587 __jbd2_journal_remove_checkpoint(jh);
1588 jbd2_journal_remove_journal_head(bh);
1589 __brelse(bh);
1590 }
1591 }
1592 spin_unlock(&journal->j_list_lock);
1593out:
1594 return;
1595}
1596
1597
1598/**
1599 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1600 * @journal: journal for operation
1601 * @page: to try and free
1602 * @unused_gfp_mask: unused
1603 *
1604 *
1605 * For all the buffers on this page,
1606 * if they are fully written out ordered data, move them onto BUF_CLEAN
1607 * so try_to_free_buffers() can reap them.
1608 *
1609 * This function returns non-zero if we wish try_to_free_buffers()
1610 * to be called. We do this if the page is releasable by try_to_free_buffers().
1611 * We also do it if the page has locked or dirty buffers and the caller wants
1612 * us to perform sync or async writeout.
1613 *
1614 * This complicates JBD locking somewhat. We aren't protected by the
1615 * BKL here. We wish to remove the buffer from its committing or
1616 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1617 *
1618 * This may *change* the value of transaction_t->t_datalist, so anyone
1619 * who looks at t_datalist needs to lock against this function.
1620 *
1621 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
1622 * buffer. So we need to lock against that. jbd2_journal_dirty_data()
1623 * will come out of the lock with the buffer dirty, which makes it
1624 * ineligible for release here.
1625 *
1626 * Who else is affected by this? hmm... Really the only contender
1627 * is do_get_write_access() - it could be looking at the buffer while
1628 * journal_try_to_free_buffer() is changing its state. But that
1629 * cannot happen because we never reallocate freed data as metadata
1630 * while the data is part of a transaction. Yes?
1631 */
1632int jbd2_journal_try_to_free_buffers(journal_t *journal,
1633 struct page *page, gfp_t unused_gfp_mask)
1634{
1635 struct buffer_head *head;
1636 struct buffer_head *bh;
1637 int ret = 0;
1638
1639 J_ASSERT(PageLocked(page));
1640
1641 head = page_buffers(page);
1642 bh = head;
1643 do {
1644 struct journal_head *jh;
1645
1646 /*
1647 * We take our own ref against the journal_head here to avoid
1648 * having to add tons of locking around each instance of
1649 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
1650 */
1651 jh = jbd2_journal_grab_journal_head(bh);
1652 if (!jh)
1653 continue;
1654
1655 jbd_lock_bh_state(bh);
1656 __journal_try_to_free_buffer(journal, bh);
1657 jbd2_journal_put_journal_head(jh);
1658 jbd_unlock_bh_state(bh);
1659 if (buffer_jbd(bh))
1660 goto busy;
1661 } while ((bh = bh->b_this_page) != head);
1662 ret = try_to_free_buffers(page);
1663busy:
1664 return ret;
1665}
1666
1667/*
1668 * This buffer is no longer needed. If it is on an older transaction's
1669 * checkpoint list we need to record it on this transaction's forget list
1670 * to pin this buffer (and hence its checkpointing transaction) down until
1671 * this transaction commits. If the buffer isn't on a checkpoint list, we
1672 * release it.
1673 * Returns non-zero if JBD no longer has an interest in the buffer.
1674 *
1675 * Called under j_list_lock.
1676 *
1677 * Called under jbd_lock_bh_state(bh).
1678 */
1679static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1680{
1681 int may_free = 1;
1682 struct buffer_head *bh = jh2bh(jh);
1683
1684 __jbd2_journal_unfile_buffer(jh);
1685
1686 if (jh->b_cp_transaction) {
1687 JBUFFER_TRACE(jh, "on running+cp transaction");
1688 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1689 clear_buffer_jbddirty(bh);
1690 may_free = 0;
1691 } else {
1692 JBUFFER_TRACE(jh, "on running transaction");
1693 jbd2_journal_remove_journal_head(bh);
1694 __brelse(bh);
1695 }
1696 return may_free;
1697}
1698
1699/*
1700 * jbd2_journal_invalidatepage
1701 *
1702 * This code is tricky. It has a number of cases to deal with.
1703 *
1704 * There are two invariants which this code relies on:
1705 *
1706 * i_size must be updated on disk before we start calling invalidatepage on the
1707 * data.
1708 *
1709 * This is done in ext3 by defining an ext3_setattr method which
1710 * updates i_size before truncate gets going. By maintaining this
1711 * invariant, we can be sure that it is safe to throw away any buffers
1712 * attached to the current transaction: once the transaction commits,
1713 * we know that the data will not be needed.
1714 *
1715 * Note however that we can *not* throw away data belonging to the
1716 * previous, committing transaction!
1717 *
1718 * Any disk blocks which *are* part of the previous, committing
1719 * transaction (and which therefore cannot be discarded immediately) are
1720 * not going to be reused in the new running transaction
1721 *
1722 * The bitmap committed_data images guarantee this: any block which is
1723 * allocated in one transaction and removed in the next will be marked
1724 * as in-use in the committed_data bitmap, so cannot be reused until
1725 * the next transaction to delete the block commits. This means that
1726 * leaving committing buffers dirty is quite safe: the disk blocks
1727 * cannot be reallocated to a different file and so buffer aliasing is
1728 * not possible.
1729 *
1730 *
1731 * The above applies mainly to ordered data mode. In writeback mode we
1732 * don't make guarantees about the order in which data hits disk --- in
1733 * particular we don't guarantee that new dirty data is flushed before
1734 * transaction commit --- so it is always safe just to discard data
1735 * immediately in that mode. --sct
1736 */
1737
1738/*
1739 * The journal_unmap_buffer helper function returns zero if the buffer
1740 * concerned remains pinned as an anonymous buffer belonging to an older
1741 * transaction.
1742 *
1743 * We're outside-transaction here. Either or both of j_running_transaction
1744 * and j_committing_transaction may be NULL.
1745 */
1746static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1747{
1748 transaction_t *transaction;
1749 struct journal_head *jh;
1750 int may_free = 1;
1751 int ret;
1752
1753 BUFFER_TRACE(bh, "entry");
1754
1755 /*
1756 * It is safe to proceed here without the j_list_lock because the
1757 * buffers cannot be stolen by try_to_free_buffers as long as we are
1758 * holding the page lock. --sct
1759 */
1760
1761 if (!buffer_jbd(bh))
1762 goto zap_buffer_unlocked;
1763
1764 spin_lock(&journal->j_state_lock);
1765 jbd_lock_bh_state(bh);
1766 spin_lock(&journal->j_list_lock);
1767
1768 jh = jbd2_journal_grab_journal_head(bh);
1769 if (!jh)
1770 goto zap_buffer_no_jh;
1771
1772 transaction = jh->b_transaction;
1773 if (transaction == NULL) {
1774 /* First case: not on any transaction. If it
1775 * has no checkpoint link, then we can zap it:
1776 * it's a writeback-mode buffer so we don't care
1777 * if it hits disk safely. */
1778 if (!jh->b_cp_transaction) {
1779 JBUFFER_TRACE(jh, "not on any transaction: zap");
1780 goto zap_buffer;
1781 }
1782
1783 if (!buffer_dirty(bh)) {
1784 /* bdflush has written it. We can drop it now */
1785 goto zap_buffer;
1786 }
1787
1788 /* OK, it must be in the journal but still not
1789 * written fully to disk: it's metadata or
1790 * journaled data... */
1791
1792 if (journal->j_running_transaction) {
1793 /* ... and once the current transaction has
1794 * committed, the buffer won't be needed any
1795 * longer. */
1796 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1797 ret = __dispose_buffer(jh,
1798 journal->j_running_transaction);
1799 jbd2_journal_put_journal_head(jh);
1800 spin_unlock(&journal->j_list_lock);
1801 jbd_unlock_bh_state(bh);
1802 spin_unlock(&journal->j_state_lock);
1803 return ret;
1804 } else {
1805 /* There is no currently-running transaction. So the
1806 * orphan record which we wrote for this file must have
1807 * passed into commit. We must attach this buffer to
1808 * the committing transaction, if it exists. */
1809 if (journal->j_committing_transaction) {
1810 JBUFFER_TRACE(jh, "give to committing trans");
1811 ret = __dispose_buffer(jh,
1812 journal->j_committing_transaction);
1813 jbd2_journal_put_journal_head(jh);
1814 spin_unlock(&journal->j_list_lock);
1815 jbd_unlock_bh_state(bh);
1816 spin_unlock(&journal->j_state_lock);
1817 return ret;
1818 } else {
1819 /* The orphan record's transaction has
1820 * committed. We can cleanse this buffer */
1821 clear_buffer_jbddirty(bh);
1822 goto zap_buffer;
1823 }
1824 }
1825 } else if (transaction == journal->j_committing_transaction) {
1826 if (jh->b_jlist == BJ_Locked) {
1827 /*
1828 * The buffer is on the committing transaction's locked
1829 * list. We have the buffer locked, so I/O has
1830 * completed. So we can nail the buffer now.
1831 */
1832 may_free = __dispose_buffer(jh, transaction);
1833 goto zap_buffer;
1834 }
1835 /*
1836 * If it is committing, we simply cannot touch it. We
1837 * can remove its next_transaction pointer from the
1838 * running transaction if that is set, but nothing
1839 * else. */
1840 JBUFFER_TRACE(jh, "on committing transaction");
1841 set_buffer_freed(bh);
1842 if (jh->b_next_transaction) {
1843 J_ASSERT(jh->b_next_transaction ==
1844 journal->j_running_transaction);
1845 jh->b_next_transaction = NULL;
1846 }
1847 jbd2_journal_put_journal_head(jh);
1848 spin_unlock(&journal->j_list_lock);
1849 jbd_unlock_bh_state(bh);
1850 spin_unlock(&journal->j_state_lock);
1851 return 0;
1852 } else {
1853 /* Good, the buffer belongs to the running transaction.
1854 * We are writing our own transaction's data, not any
1855 * previous one's, so it is safe to throw it away
1856 * (remember that we expect the filesystem to have set
1857 * i_size already for this truncate so recovery will not
1858 * expose the disk blocks we are discarding here.) */
1859 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1860 may_free = __dispose_buffer(jh, transaction);
1861 }
1862
1863zap_buffer:
1864 jbd2_journal_put_journal_head(jh);
1865zap_buffer_no_jh:
1866 spin_unlock(&journal->j_list_lock);
1867 jbd_unlock_bh_state(bh);
1868 spin_unlock(&journal->j_state_lock);
1869zap_buffer_unlocked:
1870 clear_buffer_dirty(bh);
1871 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1872 clear_buffer_mapped(bh);
1873 clear_buffer_req(bh);
1874 clear_buffer_new(bh);
1875 bh->b_bdev = NULL;
1876 return may_free;
1877}
1878
1879/**
1880 * void jbd2_journal_invalidatepage()
1881 * @journal: journal to use for flush...
1882 * @page: page to flush
1883 * @offset: start of the range to invalidate within the page.
1884 *
1885 * Reap page buffers containing data after offset in page.
1886 *
1887 */
1888void jbd2_journal_invalidatepage(journal_t *journal,
1889 struct page *page,
1890 unsigned long offset)
1891{
1892 struct buffer_head *head, *bh, *next;
1893 unsigned int curr_off = 0;
1894 int may_free = 1;
1895
1896 if (!PageLocked(page))
1897 BUG();
1898 if (!page_has_buffers(page))
1899 return;
1900
1901 /* We will potentially be playing with lists other than just the
1902 * data lists (especially for journaled data mode), so be
1903 * cautious in our locking. */
1904
1905 head = bh = page_buffers(page);
1906 do {
1907 unsigned int next_off = curr_off + bh->b_size;
1908 next = bh->b_this_page;
1909
1910 if (offset <= curr_off) {
1911 /* This block is wholly outside the truncation point */
1912 lock_buffer(bh);
1913 may_free &= journal_unmap_buffer(journal, bh);
1914 unlock_buffer(bh);
1915 }
1916 curr_off = next_off;
1917 bh = next;
1918
1919 } while (bh != head);
1920
1921 if (!offset) {
1922 if (may_free && try_to_free_buffers(page))
1923 J_ASSERT(!page_has_buffers(page));
1924 }
1925}
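/*
 * Illustrative sketch, not part of this file: a filesystem's
 * address_space ->invalidatepage operation usually just forwards to this
 * helper.  EXAMPLE_JOURNAL() is a stand-in for however the caller finds
 * its journal_t from the inode.
 *
 *	static void example_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		journal_t *journal = EXAMPLE_JOURNAL(page->mapping->host);
 *
 *		jbd2_journal_invalidatepage(journal, page, offset);
 *	}
 */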
1926
1927/*
1928 * File a buffer on the given transaction list.
1929 */
1930void __jbd2_journal_file_buffer(struct journal_head *jh,
1931 transaction_t *transaction, int jlist)
1932{
1933 struct journal_head **list = NULL;
1934 int was_dirty = 0;
1935 struct buffer_head *bh = jh2bh(jh);
1936
1937 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1938 assert_spin_locked(&transaction->t_journal->j_list_lock);
1939
1940 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1941 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1942 jh->b_transaction == 0);
1943
1944 if (jh->b_transaction && jh->b_jlist == jlist)
1945 return;
1946
1947 /* The following list of buffer states needs to be consistent
1948 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1949 * state. */
1950
1951 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1952 jlist == BJ_Shadow || jlist == BJ_Forget) {
1953 if (test_clear_buffer_dirty(bh) ||
1954 test_clear_buffer_jbddirty(bh))
1955 was_dirty = 1;
1956 }
1957
1958 if (jh->b_transaction)
1959 __jbd2_journal_temp_unlink_buffer(jh);
1960 jh->b_transaction = transaction;
1961
1962 switch (jlist) {
1963 case BJ_None:
1964 J_ASSERT_JH(jh, !jh->b_committed_data);
1965 J_ASSERT_JH(jh, !jh->b_frozen_data);
1966 return;
1967 case BJ_SyncData:
1968 list = &transaction->t_sync_datalist;
1969 break;
1970 case BJ_Metadata:
1971 transaction->t_nr_buffers++;
1972 list = &transaction->t_buffers;
1973 break;
1974 case BJ_Forget:
1975 list = &transaction->t_forget;
1976 break;
1977 case BJ_IO:
1978 list = &transaction->t_iobuf_list;
1979 break;
1980 case BJ_Shadow:
1981 list = &transaction->t_shadow_list;
1982 break;
1983 case BJ_LogCtl:
1984 list = &transaction->t_log_list;
1985 break;
1986 case BJ_Reserved:
1987 list = &transaction->t_reserved_list;
1988 break;
1989 case BJ_Locked:
1990 list = &transaction->t_locked_list;
1991 break;
1992 }
1993
1994 __blist_add_buffer(list, jh);
1995 jh->b_jlist = jlist;
1996
1997 if (was_dirty)
1998 set_buffer_jbddirty(bh);
1999}
2000
2001void jbd2_journal_file_buffer(struct journal_head *jh,
2002 transaction_t *transaction, int jlist)
2003{
2004 jbd_lock_bh_state(jh2bh(jh));
2005 spin_lock(&transaction->t_journal->j_list_lock);
2006 __jbd2_journal_file_buffer(jh, transaction, jlist);
2007 spin_unlock(&transaction->t_journal->j_list_lock);
2008 jbd_unlock_bh_state(jh2bh(jh));
2009}
2010
2011/*
2012 * Remove a buffer from its current buffer list in preparation for
2013 * dropping it from its current transaction entirely. If the buffer has
2014 * already started to be used by a subsequent transaction, refile the
2015 * buffer on that transaction's metadata list.
2016 *
2017 * Called under journal->j_list_lock
2018 *
2019 * Called under jbd_lock_bh_state(jh2bh(jh))
2020 */
2021void __jbd2_journal_refile_buffer(struct journal_head *jh)
2022{
2023 int was_dirty;
2024 struct buffer_head *bh = jh2bh(jh);
2025
2026 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2027 if (jh->b_transaction)
2028 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2029
2030 /* If the buffer is now unused, just drop it. */
2031 if (jh->b_next_transaction == NULL) {
2032 __jbd2_journal_unfile_buffer(jh);
2033 return;
2034 }
2035
2036 /*
2037 * It has been modified by a later transaction: add it to the new
2038 * transaction's metadata list.
2039 */
2040
2041 was_dirty = test_clear_buffer_jbddirty(bh);
2042 __jbd2_journal_temp_unlink_buffer(jh);
2043 jh->b_transaction = jh->b_next_transaction;
2044 jh->b_next_transaction = NULL;
2045 __jbd2_journal_file_buffer(jh, jh->b_transaction,
2046 was_dirty ? BJ_Metadata : BJ_Reserved);
2047 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2048
2049 if (was_dirty)
2050 set_buffer_jbddirty(bh);
2051}
2052
2053/*
2054 * For the unlocked version of this call, also make sure that any
2055 * hanging journal_head is cleaned up if necessary.
2056 *
2057 * __jbd2_journal_refile_buffer is usually called as part of a single locked
2058 * operation on a buffer_head, in which the caller is probably going to
2059 * be hooking the journal_head onto other lists. In that case it is up
2060 * to the caller to remove the journal_head if necessary. For the
2061 * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
2062 * doing anything else to the buffer so we need to do the cleanup
2063 * ourselves to avoid a jh leak.
2064 *
2065 * *** The journal_head may be freed by this call! ***
2066 */
2067void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2068{
2069 struct buffer_head *bh = jh2bh(jh);
2070
2071 jbd_lock_bh_state(bh);
2072 spin_lock(&journal->j_list_lock);
2073
2074 __jbd2_journal_refile_buffer(jh);
2075 jbd_unlock_bh_state(bh);
2076 jbd2_journal_remove_journal_head(bh);
2077
2078 spin_unlock(&journal->j_list_lock);
2079 __brelse(bh);
2080}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 6de374513c01..bc4b8106a490 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -334,10 +334,10 @@ static int __init init_jffs2_fs(void)
334 which means just 'no padding', without the alignment 334 which means just 'no padding', without the alignment
335 thing. But GCC doesn't have that -- we have to just 335 thing. But GCC doesn't have that -- we have to just
336 hope the structs are the right sizes, instead. */ 336 hope the structs are the right sizes, instead. */
337 BUG_ON(sizeof(struct jffs2_unknown_node) != 12); 337 BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
338 BUG_ON(sizeof(struct jffs2_raw_dirent) != 40); 338 BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
339 BUG_ON(sizeof(struct jffs2_raw_inode) != 68); 339 BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
340 BUG_ON(sizeof(struct jffs2_raw_summary) != 32); 340 BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
341 341
342 printk(KERN_INFO "JFFS2 version 2.2." 342 printk(KERN_INFO "JFFS2 version 2.2."
343#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 343#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index c11a4b9fb863..1e36bae4d0eb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -149,12 +149,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
149 return -ENOMEM; 149 return -ENOMEM;
150 s->s_fs_info = sbi; 150 s->s_fs_info = sbi;
151 151
152 /* N.B. These should be compile-time tests. 152 BUILD_BUG_ON(32 != sizeof (struct minix_inode));
153 Unfortunately that is impossible. */ 153 BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
154 if (32 != sizeof (struct minix_inode))
155 panic("bad V1 i-node size");
156 if (64 != sizeof(struct minix2_inode))
157 panic("bad V2 i-node size");
158 154
159 if (!sb_set_blocksize(s, BLOCK_SIZE)) 155 if (!sb_set_blocksize(s, BLOCK_SIZE))
160 goto out_bad_hblock; 156 goto out_bad_hblock;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e6..76b46ebbb10c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -339,7 +339,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
339 339
340#if BITS_PER_LONG == 32 340#if BITS_PER_LONG == 32
341# if defined(CONFIG_LBD) 341# if defined(CONFIG_LBD)
342 BUG_ON(sizeof(sector_t) != 8); 342 BUILD_BUG_ON(sizeof(sector_t) != 8);
343 pagefactor = PAGE_CACHE_SIZE; 343 pagefactor = PAGE_CACHE_SIZE;
344 bitshift = BITS_PER_LONG; 344 bitshift = BITS_PER_LONG;
345# else 345# else
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c89aa2338191..9041802df832 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -430,20 +430,29 @@ int remove_save_link(struct inode *inode, int truncate)
430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); 430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
431} 431}
432 432
433static void reiserfs_put_super(struct super_block *s) 433static void reiserfs_kill_sb(struct super_block *s)
434{ 434{
435 struct reiserfs_transaction_handle th; 435 if (REISERFS_SB(s)) {
436 th.t_trans_id = 0; 436 if (REISERFS_SB(s)->xattr_root) {
437 d_invalidate(REISERFS_SB(s)->xattr_root);
438 dput(REISERFS_SB(s)->xattr_root);
439 REISERFS_SB(s)->xattr_root = NULL;
440 }
437 441
438 if (REISERFS_SB(s)->xattr_root) { 442 if (REISERFS_SB(s)->priv_root) {
439 d_invalidate(REISERFS_SB(s)->xattr_root); 443 d_invalidate(REISERFS_SB(s)->priv_root);
440 dput(REISERFS_SB(s)->xattr_root); 444 dput(REISERFS_SB(s)->priv_root);
445 REISERFS_SB(s)->priv_root = NULL;
446 }
441 } 447 }
442 448
443 if (REISERFS_SB(s)->priv_root) { 449 kill_block_super(s);
444 d_invalidate(REISERFS_SB(s)->priv_root); 450}
445 dput(REISERFS_SB(s)->priv_root); 451
446 } 452static void reiserfs_put_super(struct super_block *s)
453{
454 struct reiserfs_transaction_handle th;
455 th.t_trans_id = 0;
447 456
448 /* change file system state to current state if it was mounted with read-write permissions */ 457 /* change file system state to current state if it was mounted with read-write permissions */
449 if (!(s->s_flags & MS_RDONLY)) { 458 if (!(s->s_flags & MS_RDONLY)) {
@@ -2156,7 +2165,7 @@ struct file_system_type reiserfs_fs_type = {
2156 .owner = THIS_MODULE, 2165 .owner = THIS_MODULE,
2157 .name = "reiserfs", 2166 .name = "reiserfs",
2158 .get_sb = get_super_block, 2167 .get_sb = get_super_block,
2159 .kill_sb = kill_block_super, 2168 .kill_sb = reiserfs_kill_sb,
2160 .fs_flags = FS_REQUIRES_DEV, 2169 .fs_flags = FS_REQUIRES_DEV,
2161}; 2170};
2162 2171
diff --git a/fs/super.c b/fs/super.c
index aec99ddbe53f..47e554c12e76 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -260,17 +260,17 @@ int fsync_super(struct super_block *sb)
260 * that need destruction out of superblock, call generic_shutdown_super() 260 * that need destruction out of superblock, call generic_shutdown_super()
261 * and release aforementioned objects. Note: dentries and inodes _are_ 261 * and release aforementioned objects. Note: dentries and inodes _are_
262 * taken care of and do not need specific handling. 262 * taken care of and do not need specific handling.
263 *
264 * Upon calling this function, the filesystem may no longer alter or
265 * rearrange the set of dentries belonging to this super_block, nor may it
266 * change the attachments of dentries to inodes.
263 */ 267 */
264void generic_shutdown_super(struct super_block *sb) 268void generic_shutdown_super(struct super_block *sb)
265{ 269{
266 struct dentry *root = sb->s_root;
267 struct super_operations *sop = sb->s_op; 270 struct super_operations *sop = sb->s_op;
268 271
269 if (root) { 272 if (sb->s_root) {
270 sb->s_root = NULL; 273 shrink_dcache_for_umount(sb);
271 shrink_dcache_parent(root);
272 shrink_dcache_sb(sb);
273 dput(root);
274 fsync_super(sb); 274 fsync_super(sb);
275 lock_super(sb); 275 lock_super(sb);
276 sb->s_flags &= ~MS_ACTIVE; 276 sb->s_flags &= ~MS_ACTIVE;
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 350cba5d6803..dc9e7dc07fb7 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -358,16 +358,11 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
358 unsigned long blocknr; 358 unsigned long blocknr;
359 int size = 0, i; 359 int size = 0, i;
360 360
361 if (1024 != sizeof (struct xenix_super_block)) 361 BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
362 panic("Xenix FS: bad superblock size"); 362 BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
363 if (512 != sizeof (struct sysv4_super_block)) 363 BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
364 panic("SystemV FS: bad superblock size"); 364 BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
365 if (512 != sizeof (struct sysv2_super_block)) 365 BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
366 panic("SystemV FS: bad superblock size");
367 if (500 != sizeof (struct coh_super_block))
368 panic("Coherent FS: bad superblock size");
369 if (64 != sizeof (struct sysv_inode))
370 panic("sysv fs: bad inode size");
371 366
372 sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); 367 sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
373 if (!sbi) 368 if (!sbi)
diff --git a/include/asm-alpha/io.h b/include/asm-alpha/io.h
index f5ae98c25d1f..5d15af24573b 100644
--- a/include/asm-alpha/io.h
+++ b/include/asm-alpha/io.h
@@ -533,19 +533,6 @@ extern void outsl (unsigned long port, const void *src, unsigned long count);
533#define eth_io_copy_and_sum(skb,src,len,unused) \ 533#define eth_io_copy_and_sum(skb,src,len,unused) \
534 memcpy_fromio((skb)->data,src,len) 534 memcpy_fromio((skb)->data,src,len)
535 535
536static inline int
537check_signature(const volatile void __iomem *io_addr,
538 const unsigned char *signature, int length)
539{
540 do {
541 if (readb(io_addr) != *signature)
542 return 0;
543 io_addr++;
544 signature++;
545 } while (--length);
546 return 1;
547}
548
549/* 536/*
550 * The Alpha Jensen hardware for some rather strange reason puts 537 * The Alpha Jensen hardware for some rather strange reason puts
551 * the RTC clock at 0x170 instead of 0x70. Probably due to some 538 * the RTC clock at 0x170 instead of 0x70. Probably due to some
diff --git a/include/asm-arm/arch-versatile/hardware.h b/include/asm-arm/arch-versatile/hardware.h
index 41c1bee342ad..edc06598d187 100644
--- a/include/asm-arm/arch-versatile/hardware.h
+++ b/include/asm-arm/arch-versatile/hardware.h
@@ -28,8 +28,8 @@
28/* 28/*
29 * PCI space virtual addresses 29 * PCI space virtual addresses
30 */ 30 */
31#define VERSATILE_PCI_VIRT_BASE 0xe8000000 31#define VERSATILE_PCI_VIRT_BASE (void __iomem *)0xe8000000ul
32#define VERSATILE_PCI_CFG_VIRT_BASE 0xe9000000 32#define VERSATILE_PCI_CFG_VIRT_BASE (void __iomem *)0xe9000000ul
33 33
34#if 0 34#if 0
35#define VERSATILE_PCI_VIRT_MEM_BASE0 0xf4000000 35#define VERSATILE_PCI_VIRT_MEM_BASE0 0xf4000000
diff --git a/include/asm-arm/io.h b/include/asm-arm/io.h
index 34aaaac4f617..ae999fd5dc67 100644
--- a/include/asm-arm/io.h
+++ b/include/asm-arm/io.h
@@ -193,23 +193,6 @@ extern void _memset_io(volatile void __iomem *, int, size_t);
193#define eth_io_copy_and_sum(s,c,l,b) \ 193#define eth_io_copy_and_sum(s,c,l,b) \
194 eth_copy_and_sum((s),__mem_pci(c),(l),(b)) 194 eth_copy_and_sum((s),__mem_pci(c),(l),(b))
195 195
196static inline int
197check_signature(void __iomem *io_addr, const unsigned char *signature,
198 int length)
199{
200 int retval = 0;
201 do {
202 if (readb(io_addr) != *signature)
203 goto out;
204 io_addr++;
205 signature++;
206 length--;
207 } while (length);
208 retval = 1;
209out:
210 return retval;
211}
212
213#elif !defined(readb) 196#elif !defined(readb)
214 197
215#define readb(c) (__readwrite_bug("readb"),0) 198#define readb(c) (__readwrite_bug("readb"),0)
diff --git a/include/asm-arm/uaccess.h b/include/asm-arm/uaccess.h
index 87aba57a66c4..09ad0cab9014 100644
--- a/include/asm-arm/uaccess.h
+++ b/include/asm-arm/uaccess.h
@@ -110,7 +110,7 @@ extern int __get_user_4(void *);
110#define get_user(x,p) \ 110#define get_user(x,p) \
111 ({ \ 111 ({ \
112 const register typeof(*(p)) __user *__p asm("r0") = (p);\ 112 const register typeof(*(p)) __user *__p asm("r0") = (p);\
113 register unsigned int __r2 asm("r2"); \ 113 register unsigned long __r2 asm("r2"); \
114 register int __e asm("r0"); \ 114 register int __e asm("r0"); \
115 switch (sizeof(*(__p))) { \ 115 switch (sizeof(*(__p))) { \
116 case 1: \ 116 case 1: \
diff --git a/include/asm-avr32/irq_regs.h b/include/asm-avr32/irq_regs.h
new file mode 100644
index 000000000000..3dd9c0b70270
--- /dev/null
+++ b/include/asm-avr32/irq_regs.h
@@ -0,0 +1 @@
#include <asm-generic/irq_regs.h>
diff --git a/include/asm-frv/io.h b/include/asm-frv/io.h
index 7765f5528894..20e44fe00abf 100644
--- a/include/asm-frv/io.h
+++ b/include/asm-frv/io.h
@@ -385,27 +385,6 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
385 */ 385 */
386#define xlate_dev_kmem_ptr(p) p 386#define xlate_dev_kmem_ptr(p) p
387 387
388/*
389 * Check BIOS signature
390 */
391static inline int check_signature(volatile void __iomem *io_addr,
392 const unsigned char *signature, int length)
393{
394 int retval = 0;
395
396 do {
397 if (readb(io_addr) != *signature)
398 goto out;
399 io_addr++;
400 signature++;
401 length--;
402 } while (length);
403
404 retval = 1;
405out:
406 return retval;
407}
408
409#endif /* __KERNEL__ */ 388#endif /* __KERNEL__ */
410 389
411#endif /* _ASM_IO_H */ 390#endif /* _ASM_IO_H */
diff --git a/include/asm-generic/bitops/sched.h b/include/asm-generic/bitops/sched.h
index 5ef93a4d009f..815bb0148060 100644
--- a/include/asm-generic/bitops/sched.h
+++ b/include/asm-generic/bitops/sched.h
@@ -15,7 +15,7 @@ static inline int sched_find_first_bit(const unsigned long *b)
15#if BITS_PER_LONG == 64 15#if BITS_PER_LONG == 64
16 if (unlikely(b[0])) 16 if (unlikely(b[0]))
17 return __ffs(b[0]); 17 return __ffs(b[0]);
18 if (unlikely(b[1])) 18 if (likely(b[1]))
19 return __ffs(b[1]) + 64; 19 return __ffs(b[1]) + 64;
20 return __ffs(b[2]) + 128; 20 return __ffs(b[2]) + 128;
21#elif BITS_PER_LONG == 32 21#elif BITS_PER_LONG == 32
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index b3724fe93ff1..68df0dc3ab8f 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -224,33 +224,6 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int
224 224
225#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d)) 225#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
226 226
227/**
228 * check_signature - find BIOS signatures
229 * @io_addr: mmio address to check
230 * @signature: signature block
231 * @length: length of signature
232 *
233 * Perform a signature comparison with the mmio address io_addr. This
234 * address should have been obtained by ioremap.
235 * Returns 1 on a match.
236 */
237
238static inline int check_signature(volatile void __iomem * io_addr,
239 const unsigned char *signature, int length)
240{
241 int retval = 0;
242 do {
243 if (readb(io_addr) != *signature)
244 goto out;
245 io_addr++;
246 signature++;
247 length--;
248 } while (length);
249 retval = 1;
250out:
251 return retval;
252}
253
254/* 227/*
255 * Cache management 228 * Cache management
256 * 229 *
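For context, the check_signature() helper being dropped from the per-architecture io.h headers in this series compares a byte string against an ioremap()ed MMIO region and returns 1 on a match, as its kernel-doc above states. A sketch of a typical caller (the 0xc8000 window, the length and the "ACME" signature are made-up values, not from the patch):

#include <linux/init.h>
#include <linux/errno.h>
#include <asm/io.h>

/* Hypothetical probe: the address window and signature are illustrative. */
static int __init probe_board(void)
{
	void __iomem *p = ioremap(0xc8000, 0x100);
	int found;

	if (!p)
		return -ENOMEM;
	/* Compare four signature bytes at the mapped address; 1 means match. */
	found = check_signature(p, "ACME", 4);
	iounmap(p);
	return found ? 0 : -ENODEV;
}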
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 54d905ebc63d..eef5133b9ce2 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -404,20 +404,6 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero(void *to,
404 * anything, so this is accurate. 404 * anything, so this is accurate.
405 */ 405 */
406 406
407/**
408 * __copy_to_user: - Copy a block of data into user space, with less checking.
409 * @to: Destination address, in user space.
410 * @from: Source address, in kernel space.
411 * @n: Number of bytes to copy.
412 *
413 * Context: User context only. This function may sleep.
414 *
415 * Copy data from kernel space to user space. Caller must check
416 * the specified block with access_ok() before calling this function.
417 *
418 * Returns number of bytes that could not be copied.
419 * On success, this will be zero.
420 */
421static __always_inline unsigned long __must_check 407static __always_inline unsigned long __must_check
422__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) 408__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
423{ 409{
@@ -439,35 +425,27 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
439 return __copy_to_user_ll(to, from, n); 425 return __copy_to_user_ll(to, from, n);
440} 426}
441 427
442static __always_inline unsigned long __must_check
443__copy_to_user(void __user *to, const void *from, unsigned long n)
444{
445 might_sleep();
446 return __copy_to_user_inatomic(to, from, n);
447}
448
449/** 428/**
450 * __copy_from_user: - Copy a block of data from user space, with less checking. 429 * __copy_to_user: - Copy a block of data into user space, with less checking.
451 * @to: Destination address, in kernel space. 430 * @to: Destination address, in user space.
452 * @from: Source address, in user space. 431 * @from: Source address, in kernel space.
453 * @n: Number of bytes to copy. 432 * @n: Number of bytes to copy.
454 * 433 *
455 * Context: User context only. This function may sleep. 434 * Context: User context only. This function may sleep.
456 * 435 *
457 * Copy data from user space to kernel space. Caller must check 436 * Copy data from kernel space to user space. Caller must check
458 * the specified block with access_ok() before calling this function. 437 * the specified block with access_ok() before calling this function.
459 * 438 *
460 * Returns number of bytes that could not be copied. 439 * Returns number of bytes that could not be copied.
461 * On success, this will be zero. 440 * On success, this will be zero.
462 *
463 * If some data could not be copied, this function will pad the copied
464 * data to the requested size using zero bytes.
465 *
466 * An alternate version - __copy_from_user_inatomic() - may be called from
467 * atomic context and will fail rather than sleep. In this case the
468 * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
469 * for explanation of why this is needed.
470 */ 441 */
442static __always_inline unsigned long __must_check
443__copy_to_user(void __user *to, const void *from, unsigned long n)
444{
445 might_sleep();
446 return __copy_to_user_inatomic(to, from, n);
447}
448
471static __always_inline unsigned long 449static __always_inline unsigned long
472__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) 450__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
473{ 451{
@@ -493,6 +471,29 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
493 } 471 }
494 return __copy_from_user_ll_nozero(to, from, n); 472 return __copy_from_user_ll_nozero(to, from, n);
495} 473}
474
475/**
476 * __copy_from_user: - Copy a block of data from user space, with less checking.
477 * @to: Destination address, in kernel space.
478 * @from: Source address, in user space.
479 * @n: Number of bytes to copy.
480 *
481 * Context: User context only. This function may sleep.
482 *
483 * Copy data from user space to kernel space. Caller must check
484 * the specified block with access_ok() before calling this function.
485 *
486 * Returns number of bytes that could not be copied.
487 * On success, this will be zero.
488 *
489 * If some data could not be copied, this function will pad the copied
490 * data to the requested size using zero bytes.
491 *
492 * An alternate version - __copy_from_user_inatomic() - may be called from
493 * atomic context and will fail rather than sleep. In this case the
494 * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
495 * for explanation of why this is needed.
496 */
496static __always_inline unsigned long 497static __always_inline unsigned long
497__copy_from_user(void *to, const void __user *from, unsigned long n) 498__copy_from_user(void *to, const void __user *from, unsigned long n)
498{ 499{
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 3ca7ab963d7d..beeeaf6b054a 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,11 @@
324#define __NR_vmsplice 316 324#define __NR_vmsplice 316
325#define __NR_move_pages 317 325#define __NR_move_pages 317
326#define __NR_getcpu 318 326#define __NR_getcpu 318
327#define __NR_epoll_pwait 319
327 328
328#ifdef __KERNEL__ 329#ifdef __KERNEL__
329 330
330#define NR_syscalls 319 331#define NR_syscalls 320
331#include <linux/err.h> 332#include <linux/err.h>
332 333
333/* 334/*
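The unistd.h hunk assigns syscall number 319 to epoll_pwait on i386 and bumps NR_syscalls to match. Until a C-library wrapper exists, user space would invoke it by number; a rough sketch, assuming the usual six-argument raw interface (epfd, events, maxevents, timeout, sigmask, sigsetsize):

#include <sys/epoll.h>
#include <sys/syscall.h>
#include <signal.h>
#include <unistd.h>

/* 319 is __NR_epoll_pwait as added by the hunk above (i386 only). */
static int my_epoll_pwait(int epfd, struct epoll_event *events, int maxevents,
			  int timeout, const sigset_t *sigmask)
{
	return syscall(319, epfd, events, maxevents, timeout,
		       sigmask, sizeof(sigset_t));
}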
diff --git a/include/asm-m32r/io.h b/include/asm-m32r/io.h
index 70ad1c949c2b..d06933bd6318 100644
--- a/include/asm-m32r/io.h
+++ b/include/asm-m32r/io.h
@@ -166,38 +166,6 @@ static inline void _writel(unsigned long l, unsigned long addr)
166 166
167#define flush_write_buffers() do { } while (0) /* M32R_FIXME */ 167#define flush_write_buffers() do { } while (0) /* M32R_FIXME */
168 168
169/**
170 * check_signature - find BIOS signatures
171 * @io_addr: mmio address to check
172 * @signature: signature block
173 * @length: length of signature
174 *
175 * Perform a signature comparison with the ISA mmio address io_addr.
176 * Returns 1 on a match.
177 *
178 * This function is deprecated. New drivers should use ioremap and
179 * check_signature.
180 */
181
182static inline int check_signature(void __iomem *io_addr,
183 const unsigned char *signature, int length)
184{
185 int retval = 0;
186#if 0
187printk("check_signature\n");
188 do {
189 if (readb(io_addr) != *signature)
190 goto out;
191 io_addr++;
192 signature++;
193 length--;
194 } while (length);
195 retval = 1;
196out:
197#endif
198 return retval;
199}
200
201static inline void 169static inline void
202memset_io(volatile void __iomem *addr, unsigned char val, int count) 170memset_io(volatile void __iomem *addr, unsigned char val, int count)
203{ 171{
diff --git a/include/asm-m68k/uaccess.h b/include/asm-m68k/uaccess.h
index 88b1f47400e1..e4c9f080ff20 100644
--- a/include/asm-m68k/uaccess.h
+++ b/include/asm-m68k/uaccess.h
@@ -76,7 +76,7 @@ asm volatile ("\n" \
76 break; \ 76 break; \
77 case 8: \ 77 case 8: \
78 { \ 78 { \
79 const void *__pu_ptr = (ptr); \ 79 const void __user *__pu_ptr = (ptr); \
80 asm volatile ("\n" \ 80 asm volatile ("\n" \
81 "1: moves.l %2,(%1)+\n" \ 81 "1: moves.l %2,(%1)+\n" \
82 "2: moves.l %R2,(%1)\n" \ 82 "2: moves.l %R2,(%1)\n" \
@@ -125,7 +125,7 @@ asm volatile ("\n" \
125 " .previous" \ 125 " .previous" \
126 : "+d" (res), "=&" #reg (__gu_val) \ 126 : "+d" (res), "=&" #reg (__gu_val) \
127 : "m" (*(ptr)), "i" (err)); \ 127 : "m" (*(ptr)), "i" (err)); \
128 (x) = (typeof(*(ptr)))(long)__gu_val; \ 128 (x) = (typeof(*(ptr)))(unsigned long)__gu_val; \
129}) 129})
130 130
131#define __get_user(x, ptr) \ 131#define __get_user(x, ptr) \
@@ -221,16 +221,16 @@ __constant_copy_from_user(void *to, const void __user *from, unsigned long n)
221 221
222 switch (n) { 222 switch (n) {
223 case 1: 223 case 1:
224 __get_user_asm(res, *(u8 *)to, (u8 *)from, u8, b, d, 1); 224 __get_user_asm(res, *(u8 *)to, (u8 __user *)from, u8, b, d, 1);
225 break; 225 break;
226 case 2: 226 case 2:
227 __get_user_asm(res, *(u16 *)to, (u16 *)from, u16, w, d, 2); 227 __get_user_asm(res, *(u16 *)to, (u16 __user *)from, u16, w, d, 2);
228 break; 228 break;
229 case 3: 229 case 3:
230 __constant_copy_from_user_asm(res, to, from, tmp, 3, w, b,); 230 __constant_copy_from_user_asm(res, to, from, tmp, 3, w, b,);
231 break; 231 break;
232 case 4: 232 case 4:
233 __get_user_asm(res, *(u32 *)to, (u32 *)from, u32, l, r, 4); 233 __get_user_asm(res, *(u32 *)to, (u32 __user *)from, u32, l, r, 4);
234 break; 234 break;
235 case 5: 235 case 5:
236 __constant_copy_from_user_asm(res, to, from, tmp, 5, l, b,); 236 __constant_copy_from_user_asm(res, to, from, tmp, 5, l, b,);
@@ -302,16 +302,16 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n)
302 302
303 switch (n) { 303 switch (n) {
304 case 1: 304 case 1:
305 __put_user_asm(res, *(u8 *)from, (u8 *)to, b, d, 1); 305 __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1);
306 break; 306 break;
307 case 2: 307 case 2:
308 __put_user_asm(res, *(u16 *)from, (u16 *)to, w, d, 2); 308 __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, d, 2);
309 break; 309 break;
310 case 3: 310 case 3:
311 __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); 311 __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,);
312 break; 312 break;
313 case 4: 313 case 4:
314 __put_user_asm(res, *(u32 *)from, (u32 *)to, l, r, 4); 314 __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4);
315 break; 315 break;
316 case 5: 316 case 5:
317 __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); 317 __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,);
diff --git a/include/asm-mips/io.h b/include/asm-mips/io.h
index df624e1ee6e2..c2d124badbe5 100644
--- a/include/asm-mips/io.h
+++ b/include/asm-mips/io.h
@@ -562,32 +562,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
562#define eth_io_copy_and_sum(skb,src,len,unused) memcpy_fromio((skb)->data,(src),(len)) 562#define eth_io_copy_and_sum(skb,src,len,unused) memcpy_fromio((skb)->data,(src),(len))
563 563
564/* 564/*
565 * check_signature - find BIOS signatures
566 * @io_addr: mmio address to check
567 * @signature: signature block
568 * @length: length of signature
569 *
570 * Perform a signature comparison with the mmio address io_addr. This
571 * address should have been obtained by ioremap.
572 * Returns 1 on a match.
573 */
574static inline int check_signature(char __iomem *io_addr,
575 const unsigned char *signature, int length)
576{
577 int retval = 0;
578 do {
579 if (readb(io_addr) != *signature)
580 goto out;
581 io_addr++;
582 signature++;
583 length--;
584 } while (length);
585 retval = 1;
586out:
587 return retval;
588}
589
590/*
591 * The caches on some architectures aren't dma-coherent and have need to 565 * The caches on some architectures aren't dma-coherent and have need to
592 * handle this in software. There are three types of operations that 566 * handle this in software. There are three types of operations that
593 * can be applied to dma buffers. 567 * can be applied to dma buffers.
diff --git a/include/asm-powerpc/io.h b/include/asm-powerpc/io.h
index cbbd8c648df1..3baff8b0fd5a 100644
--- a/include/asm-powerpc/io.h
+++ b/include/asm-powerpc/io.h
@@ -404,32 +404,6 @@ static inline void __out_be64(volatile unsigned long __iomem *addr, unsigned lon
404 404
405#include <asm/eeh.h> 405#include <asm/eeh.h>
406 406
407/**
408 * check_signature - find BIOS signatures
409 * @io_addr: mmio address to check
410 * @signature: signature block
411 * @length: length of signature
412 *
413 * Perform a signature comparison with the mmio address io_addr. This
414 * address should have been obtained by ioremap.
415 * Returns 1 on a match.
416 */
417static inline int check_signature(const volatile void __iomem * io_addr,
418 const unsigned char *signature, int length)
419{
420 int retval = 0;
421 do {
422 if (readb(io_addr) != *signature)
423 goto out;
424 io_addr++;
425 signature++;
426 length--;
427 } while (length);
428 retval = 1;
429out:
430 return retval;
431}
432
433/* Nothing to do */ 407/* Nothing to do */
434 408
435#define dma_cache_inv(_start,_size) do { } while (0) 409#define dma_cache_inv(_start,_size) do { } while (0)
diff --git a/include/asm-ppc/io.h b/include/asm-ppc/io.h
index 3d9a9e6f3321..a4c411b753ef 100644
--- a/include/asm-ppc/io.h
+++ b/include/asm-ppc/io.h
@@ -439,22 +439,6 @@ extern inline void * phys_to_virt(unsigned long address)
439#define iobarrier_r() eieio() 439#define iobarrier_r() eieio()
440#define iobarrier_w() eieio() 440#define iobarrier_w() eieio()
441 441
442static inline int check_signature(volatile void __iomem * io_addr,
443 const unsigned char *signature, int length)
444{
445 int retval = 0;
446 do {
447 if (readb(io_addr) != *signature)
448 goto out;
449 io_addr++;
450 signature++;
451 length--;
452 } while (length);
453 retval = 1;
454out:
455 return retval;
456}
457
458/* 442/*
459 * Here comes the ppc implementation of the IOMAP 443 * Here comes the ppc implementation of the IOMAP
460 * interfaces. 444 * interfaces.
diff --git a/include/asm-sh/io.h b/include/asm-sh/io.h
index ed12d38e8c00..a0e55b09e4fd 100644
--- a/include/asm-sh/io.h
+++ b/include/asm-sh/io.h
@@ -304,22 +304,6 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
304#define iounmap(addr) \ 304#define iounmap(addr) \
305 __iounmap((addr)) 305 __iounmap((addr))
306 306
307static inline int check_signature(char __iomem *io_addr,
308 const unsigned char *signature, int length)
309{
310 int retval = 0;
311 do {
312 if (readb(io_addr) != *signature)
313 goto out;
314 io_addr++;
315 signature++;
316 length--;
317 } while (length);
318 retval = 1;
319out:
320 return retval;
321}
322
323/* 307/*
324 * The caches on some architectures aren't dma-coherent and have need to 308 * The caches on some architectures aren't dma-coherent and have need to
325 * handle this in software. There are three types of operations that 309 * handle this in software. There are three types of operations that
diff --git a/include/asm-sh64/io.h b/include/asm-sh64/io.h
index 252fedbb6621..14d8e7b4bf4b 100644
--- a/include/asm-sh64/io.h
+++ b/include/asm-sh64/io.h
@@ -178,22 +178,6 @@ extern void iounmap(void *addr);
178unsigned long onchip_remap(unsigned long addr, unsigned long size, const char* name); 178unsigned long onchip_remap(unsigned long addr, unsigned long size, const char* name);
179extern void onchip_unmap(unsigned long vaddr); 179extern void onchip_unmap(unsigned long vaddr);
180 180
181static __inline__ int check_signature(volatile void __iomem *io_addr,
182 const unsigned char *signature, int length)
183{
184 int retval = 0;
185 do {
186 if (readb(io_addr) != *signature)
187 goto out;
188 io_addr++;
189 signature++;
190 length--;
191 } while (length);
192 retval = 1;
193out:
194 return retval;
195}
196
197/* 181/*
198 * The caches on some architectures aren't dma-coherent and have need to 182 * The caches on some architectures aren't dma-coherent and have need to
199 * handle this in software. There are three types of operations that 183 * handle this in software. There are three types of operations that
diff --git a/include/asm-sparc64/io.h b/include/asm-sparc64/io.h
index 0056770e83ad..30b912d8e8bc 100644
--- a/include/asm-sparc64/io.h
+++ b/include/asm-sparc64/io.h
@@ -440,21 +440,6 @@ _memcpy_toio(volatile void __iomem *dst, const void *src, __kernel_size_t n)
440 440
441#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz) 441#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz)
442 442
443static inline int check_signature(void __iomem *io_addr,
444 const unsigned char *signature,
445 int length)
446{
447 int retval = 0;
448 do {
449 if (readb(io_addr) != *signature++)
450 goto out;
451 io_addr++;
452 } while (--length);
453 retval = 1;
454out:
455 return retval;
456}
457
458#define mmiowb() 443#define mmiowb()
459 444
460#ifdef __KERNEL__ 445#ifdef __KERNEL__
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index 70e91fe76344..6ee9fadaaacb 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -254,33 +254,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c);
254 254
255#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d)) 255#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
256 256
257/**
258 * check_signature - find BIOS signatures
259 * @io_addr: mmio address to check
260 * @signature: signature block
261 * @length: length of signature
262 *
263 * Perform a signature comparison with the mmio address io_addr. This
264 * address should have been obtained by ioremap.
265 * Returns 1 on a match.
266 */
267
268static inline int check_signature(void __iomem *io_addr,
269 const unsigned char *signature, int length)
270{
271 int retval = 0;
272 do {
273 if (readb(io_addr) != *signature)
274 goto out;
275 io_addr++;
276 signature++;
277 length--;
278 } while (length);
279 retval = 1;
280out:
281 return retval;
282}
283
284/* Nothing to do */ 257/* Nothing to do */
285 258
286#define dma_cache_inv(_start,_size) do { } while (0) 259#define dma_cache_inv(_start,_size) do { } while (0)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index dcc5de7cc487..64b4641904fe 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -46,7 +46,8 @@
46 * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) 46 * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
47 * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) 47 * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
48 * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf 48 * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
49 * bitmap_parse(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf 49 * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
50 * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
50 * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf 51 * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf
51 * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from list 52 * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from list
52 * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region 53 * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
@@ -106,7 +107,9 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);
106 107
107extern int bitmap_scnprintf(char *buf, unsigned int len, 108extern int bitmap_scnprintf(char *buf, unsigned int len,
108 const unsigned long *src, int nbits); 109 const unsigned long *src, int nbits);
109extern int bitmap_parse(const char __user *ubuf, unsigned int ulen, 110extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
111 unsigned long *dst, int nbits);
112extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
110 unsigned long *dst, int nbits); 113 unsigned long *dst, int nbits);
111extern int bitmap_scnlistprintf(char *buf, unsigned int len, 114extern int bitmap_scnlistprintf(char *buf, unsigned int len,
112 const unsigned long *src, int nbits); 115 const unsigned long *src, int nbits);
@@ -270,6 +273,12 @@ static inline void bitmap_shift_left(unsigned long *dst,
270 __bitmap_shift_left(dst, src, n, nbits); 273 __bitmap_shift_left(dst, src, n, nbits);
271} 274}
272 275
276static inline int bitmap_parse(const char *buf, unsigned int buflen,
277 unsigned long *maskp, int nmaskbits)
278{
279 return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
280}
281
273#endif /* __ASSEMBLY__ */ 282#endif /* __ASSEMBLY__ */
274 283
275#endif /* __LINUX_BITMAP_H */ 284#endif /* __LINUX_BITMAP_H */
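After this change bitmap_parse() takes a buffer that is already in kernel memory (it is now an inline wrapper around __bitmap_parse() with is_user == 0), while the new bitmap_parse_user() is the entry point for __user buffers. A short sketch of the split (the wrapper functions and the 64-bit mask are illustrative):

#include <linux/bitmap.h>
#include <linux/string.h>

static DECLARE_BITMAP(mask, 64);

/* Buffer already in kernel memory (e.g. a module parameter string). */
static int parse_kernel_string(const char *s)
{
	return bitmap_parse(s, strlen(s), mask, 64);
}

/* Buffer supplied by user space, e.g. from a write() handler. */
static int parse_from_user(const char __user *ubuf, size_t len)
{
	return bitmap_parse_user(ubuf, len, mask, 64);
}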
diff --git a/include/linux/carta_random32.h b/include/linux/carta_random32.h
new file mode 100644
index 000000000000..f6f3bd9f20b5
--- /dev/null
+++ b/include/linux/carta_random32.h
@@ -0,0 +1,29 @@
1/*
2 * Fast, simple, yet decent quality random number generator based on
3 * a paper by David G. Carta ("Two Fast Implementations of the
4 * `Minimal Standard' Random Number Generator," Communications of the
5 * ACM, January, 1990).
6 *
7 * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
8 * Contributed by Stephane Eranian <eranian@hpl.hp.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 * 02111-1307 USA
23 */
24#ifndef _LINUX_CARTA_RANDOM32_H_
25#define _LINUX_CARTA_RANDOM32_H_
26
27u64 carta_random32(u64 seed);
28
29#endif /* _LINUX_CARTA_RANDOM32_H_ */
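The new header only declares carta_random32(); it does not show how callers are expected to drive it. A speculative sketch, assuming the usual pattern for this kind of generator of feeding each return value back in as the next seed:

#include <linux/types.h>
#include <linux/carta_random32.h>

static u64 prng_state = 1;

/* Assumption: chain the return value as the next seed to get a stream. */
static u64 next_random(void)
{
	prng_state = carta_random32(prng_state);
	return prng_state;
}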
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 4e1663d7691e..cfdb4f6a89d4 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -61,17 +61,23 @@ COMPATIBLE_IOCTL(FIGETBSZ)
61 * Some need translations, these do not. 61 * Some need translations, these do not.
62 */ 62 */
63COMPATIBLE_IOCTL(HDIO_GET_IDENTITY) 63COMPATIBLE_IOCTL(HDIO_GET_IDENTITY)
64COMPATIBLE_IOCTL(HDIO_SET_DMA)
65COMPATIBLE_IOCTL(HDIO_SET_UNMASKINTR)
66COMPATIBLE_IOCTL(HDIO_SET_NOWERR)
67COMPATIBLE_IOCTL(HDIO_SET_32BIT)
68COMPATIBLE_IOCTL(HDIO_SET_MULTCOUNT)
69COMPATIBLE_IOCTL(HDIO_DRIVE_CMD)
70COMPATIBLE_IOCTL(HDIO_DRIVE_TASK) 64COMPATIBLE_IOCTL(HDIO_DRIVE_TASK)
71COMPATIBLE_IOCTL(HDIO_SET_PIO_MODE) 65COMPATIBLE_IOCTL(HDIO_DRIVE_CMD)
72COMPATIBLE_IOCTL(HDIO_SET_NICE) 66ULONG_IOCTL(HDIO_SET_MULTCOUNT)
73COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS) 67ULONG_IOCTL(HDIO_SET_UNMASKINTR)
68ULONG_IOCTL(HDIO_SET_KEEPSETTINGS)
69ULONG_IOCTL(HDIO_SET_32BIT)
70ULONG_IOCTL(HDIO_SET_NOWERR)
71ULONG_IOCTL(HDIO_SET_DMA)
72ULONG_IOCTL(HDIO_SET_PIO_MODE)
73ULONG_IOCTL(HDIO_SET_NICE)
74ULONG_IOCTL(HDIO_SET_WCACHE)
75ULONG_IOCTL(HDIO_SET_ACOUSTIC)
76ULONG_IOCTL(HDIO_SET_BUSSTATE)
77ULONG_IOCTL(HDIO_SET_ADDRESS)
74COMPATIBLE_IOCTL(HDIO_SCAN_HWIF) 78COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
79/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
80COMPATIBLE_IOCTL(0x330)
75/* 0x02 -- Floppy ioctls */ 81/* 0x02 -- Floppy ioctls */
76COMPATIBLE_IOCTL(FDMSGON) 82COMPATIBLE_IOCTL(FDMSGON)
77COMPATIBLE_IOCTL(FDMSGOFF) 83COMPATIBLE_IOCTL(FDMSGOFF)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b268a3c0c376..d0e8c8b0e34d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -8,8 +8,8 @@
8 * See detailed comments in the file linux/bitmap.h describing the 8 * See detailed comments in the file linux/bitmap.h describing the
9 * data type on which these cpumasks are based. 9 * data type on which these cpumasks are based.
10 * 10 *
11 * For details of cpumask_scnprintf() and cpumask_parse(), 11 * For details of cpumask_scnprintf() and cpumask_parse_user(),
12 * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. 12 * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
13 * For details of cpulist_scnprintf() and cpulist_parse(), see 13 * For details of cpulist_scnprintf() and cpulist_parse(), see
14 * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c. 14 * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
15 * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c 15 * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c
@@ -49,7 +49,7 @@
49 * unsigned long *cpus_addr(mask) Array of unsigned long's in mask 49 * unsigned long *cpus_addr(mask) Array of unsigned long's in mask
50 * 50 *
51 * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing 51 * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing
52 * int cpumask_parse(ubuf, ulen, mask) Parse ascii string as cpumask 52 * int cpumask_parse_user(ubuf, ulen, mask) Parse ascii string as cpumask
53 * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing 53 * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing
54 * int cpulist_parse(buf, map) Parse ascii string as cpulist 54 * int cpulist_parse(buf, map) Parse ascii string as cpulist
55 * int cpu_remap(oldbit, old, new) newbit = map(old, new)(oldbit) 55 * int cpu_remap(oldbit, old, new) newbit = map(old, new)(oldbit)
@@ -273,12 +273,12 @@ static inline int __cpumask_scnprintf(char *buf, int len,
273 return bitmap_scnprintf(buf, len, srcp->bits, nbits); 273 return bitmap_scnprintf(buf, len, srcp->bits, nbits);
274} 274}
275 275
276#define cpumask_parse(ubuf, ulen, dst) \ 276#define cpumask_parse_user(ubuf, ulen, dst) \
277 __cpumask_parse((ubuf), (ulen), &(dst), NR_CPUS) 277 __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
278static inline int __cpumask_parse(const char __user *buf, int len, 278static inline int __cpumask_parse_user(const char __user *buf, int len,
279 cpumask_t *dstp, int nbits) 279 cpumask_t *dstp, int nbits)
280{ 280{
281 return bitmap_parse(buf, len, dstp->bits, nbits); 281 return bitmap_parse_user(buf, len, dstp->bits, nbits);
282} 282}
283 283
284#define cpulist_scnprintf(buf, len, src) \ 284#define cpulist_scnprintf(buf, len, src) \
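cpumask_parse() becomes cpumask_parse_user(), making explicit that the buffer being parsed lives in user space. A hypothetical /proc write handler using the renamed helper might look like this (the handler and its data are illustrative, not part of the patch):

#include <linux/fs.h>
#include <linux/cpumask.h>

static cpumask_t tracked_cpus;

static int cpus_write_proc(struct file *file, const char __user *buf,
			   unsigned long count, void *data)
{
	cpumask_t new_mask;
	int err;

	/* Parse the ASCII cpumask directly from the user buffer. */
	err = cpumask_parse_user(buf, count, new_mask);
	if (err)
		return err;
	tracked_cpus = new_mask;
	return count;
}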
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 44605be59409..63f64a9a5bf7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -230,6 +230,7 @@ extern struct dentry * d_alloc_anon(struct inode *);
230extern struct dentry * d_splice_alias(struct inode *, struct dentry *); 230extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
231extern void shrink_dcache_sb(struct super_block *); 231extern void shrink_dcache_sb(struct super_block *);
232extern void shrink_dcache_parent(struct dentry *); 232extern void shrink_dcache_parent(struct dentry *);
233extern void shrink_dcache_for_umount(struct super_block *);
233extern int d_invalidate(struct dentry *); 234extern int d_invalidate(struct dentry *);
234 235
235/* only used at mount-time */ 236/* only used at mount-time */
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
new file mode 100644
index 000000000000..498503ee613d
--- /dev/null
+++ b/include/linux/ext4_fs.h
@@ -0,0 +1,994 @@
1/*
2 * linux/include/linux/ext4_fs.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _LINUX_EXT4_FS_H
17#define _LINUX_EXT4_FS_H
18
19#include <linux/types.h>
20#include <linux/blkdev.h>
21#include <linux/magic.h>
22
23/*
24 * The second extended filesystem constants/structures
25 */
26
27/*
28 * Define EXT4FS_DEBUG to produce debug messages
29 */
30#undef EXT4FS_DEBUG
31
32/*
33 * Define EXT4_RESERVATION to reserve data blocks for expanding files
34 */
35#define EXT4_DEFAULT_RESERVE_BLOCKS 8
36/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
37#define EXT4_MAX_RESERVE_BLOCKS 1027
38#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
39/*
40 * Always enable hashed directories
41 */
42#define CONFIG_EXT4_INDEX
43
44/*
45 * Debug code
46 */
47#ifdef EXT4FS_DEBUG
48#define ext4_debug(f, a...) \
49 do { \
50 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
51 __FILE__, __LINE__, __FUNCTION__); \
52 printk (KERN_DEBUG f, ## a); \
53 } while (0)
54#else
55#define ext4_debug(f, a...) do {} while (0)
56#endif
57
58/*
59 * Special inodes numbers
60 */
61#define EXT4_BAD_INO 1 /* Bad blocks inode */
62#define EXT4_ROOT_INO 2 /* Root inode */
63#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
64#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
65#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
66#define EXT4_JOURNAL_INO 8 /* Journal inode */
67
68/* First non-reserved inode for old ext4 filesystems */
69#define EXT4_GOOD_OLD_FIRST_INO 11
70
71/*
72 * Maximal count of links to a file
73 */
74#define EXT4_LINK_MAX 32000
75
76/*
77 * Macro-instructions used to manage several block sizes
78 */
79#define EXT4_MIN_BLOCK_SIZE 1024
80#define EXT4_MAX_BLOCK_SIZE 4096
81#define EXT4_MIN_BLOCK_LOG_SIZE 10
82#ifdef __KERNEL__
83# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize)
84#else
85# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
86#endif
87#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32))
88#ifdef __KERNEL__
89# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
90#else
91# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
92#endif
93#ifdef __KERNEL__
94#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits)
95#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size)
96#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino)
97#else
98#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
99 EXT4_GOOD_OLD_INODE_SIZE : \
100 (s)->s_inode_size)
101#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
102 EXT4_GOOD_OLD_FIRST_INO : \
103 (s)->s_first_ino)
104#endif
105
106/*
107 * Macro-instructions used to manage fragments
108 */
109#define EXT4_MIN_FRAG_SIZE 1024
110#define EXT4_MAX_FRAG_SIZE 4096
111#define EXT4_MIN_FRAG_LOG_SIZE 10
112#ifdef __KERNEL__
113# define EXT4_FRAG_SIZE(s) (EXT4_SB(s)->s_frag_size)
114# define EXT4_FRAGS_PER_BLOCK(s) (EXT4_SB(s)->s_frags_per_block)
115#else
116# define EXT4_FRAG_SIZE(s) (EXT4_MIN_FRAG_SIZE << (s)->s_log_frag_size)
117# define EXT4_FRAGS_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_FRAG_SIZE(s))
118#endif
119
120/*
121 * Structure of a blocks group descriptor
122 */
123struct ext4_group_desc
124{
125 __le32 bg_block_bitmap; /* Blocks bitmap block */
126 __le32 bg_inode_bitmap; /* Inodes bitmap block */
127 __le32 bg_inode_table; /* Inodes table block */
128 __le16 bg_free_blocks_count; /* Free blocks count */
129 __le16 bg_free_inodes_count; /* Free inodes count */
130 __le16 bg_used_dirs_count; /* Directories count */
131 __u16 bg_flags;
132 __u32 bg_reserved[3];
133 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
134 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
135 __le32 bg_inode_table_hi; /* Inodes table block MSB */
136};
137
138#ifdef __KERNEL__
139#include <linux/ext4_fs_i.h>
140#include <linux/ext4_fs_sb.h>
141#endif
142/*
143 * Macro-instructions used to manage group descriptors
144 */
145#define EXT4_MIN_DESC_SIZE 32
146#define EXT4_MIN_DESC_SIZE_64BIT 64
147#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE
148#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
149#ifdef __KERNEL__
150# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
151# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
152# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
153# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
154#else
155# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group)
156# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
157# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group)
158#endif
159
160/*
161 * Constants relative to the data blocks
162 */
163#define EXT4_NDIR_BLOCKS 12
164#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS
165#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1)
166#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1)
167#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1)
168
169/*
170 * Inode flags
171 */
172#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */
173#define EXT4_UNRM_FL 0x00000002 /* Undelete */
174#define EXT4_COMPR_FL 0x00000004 /* Compress file */
175#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */
176#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */
177#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */
178#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */
179#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */
180/* Reserved for compression usage... */
181#define EXT4_DIRTY_FL 0x00000100
182#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
183#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
184#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
185/* End compression flags --- maybe not all used */
186#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
187#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
188#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
189#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */
190#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
191#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
192#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
193#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
194
195#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
196#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
197
198/*
199 * Inode dynamic state flags
200 */
201#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
202#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
203#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
204
205/* Used to pass group descriptor data when online resize is done */
206struct ext4_new_group_input {
207 __u32 group; /* Group number for this data */
208 __u64 block_bitmap; /* Absolute block number of block bitmap */
209 __u64 inode_bitmap; /* Absolute block number of inode bitmap */
210 __u64 inode_table; /* Absolute block number of inode table start */
211 __u32 blocks_count; /* Total number of blocks in this group */
212 __u16 reserved_blocks; /* Number of reserved blocks in this group */
213 __u16 unused;
214};
215
216/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
217struct ext4_new_group_data {
218 __u32 group;
219 __u64 block_bitmap;
220 __u64 inode_bitmap;
221 __u64 inode_table;
222 __u32 blocks_count;
223 __u16 reserved_blocks;
224 __u16 unused;
225 __u32 free_blocks_count;
226};
227
228
229/*
230 * ioctl commands
231 */
232#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
233#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
234#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
235#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
236#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
237#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
238#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
239#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
240#ifdef CONFIG_JBD_DEBUG
241#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
242#endif
243#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
244#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
245
246/*
247 * ioctl commands in 32 bit emulation
248 */
249#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
250#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
251#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
252#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
253#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
254#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
255#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
256#ifdef CONFIG_JBD_DEBUG
257#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
258#endif
259#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
260#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
261
262
263/*
264 * Mount options
265 */
266struct ext4_mount_options {
267 unsigned long s_mount_opt;
268 uid_t s_resuid;
269 gid_t s_resgid;
270 unsigned long s_commit_interval;
271#ifdef CONFIG_QUOTA
272 int s_jquota_fmt;
273 char *s_qf_names[MAXQUOTAS];
274#endif
275};
276
277/*
278 * Structure of an inode on the disk
279 */
280struct ext4_inode {
281 __le16 i_mode; /* File mode */
282 __le16 i_uid; /* Low 16 bits of Owner Uid */
283 __le32 i_size; /* Size in bytes */
284 __le32 i_atime; /* Access time */
285 __le32 i_ctime; /* Creation time */
286 __le32 i_mtime; /* Modification time */
287 __le32 i_dtime; /* Deletion Time */
288 __le16 i_gid; /* Low 16 bits of Group Id */
289 __le16 i_links_count; /* Links count */
290 __le32 i_blocks; /* Blocks count */
291 __le32 i_flags; /* File flags */
292 union {
293 struct {
294 __u32 l_i_reserved1;
295 } linux1;
296 struct {
297 __u32 h_i_translator;
298 } hurd1;
299 struct {
300 __u32 m_i_reserved1;
301 } masix1;
302 } osd1; /* OS dependent 1 */
303 __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
304 __le32 i_generation; /* File version (for NFS) */
305 __le32 i_file_acl; /* File ACL */
306 __le32 i_dir_acl; /* Directory ACL */
307 __le32 i_faddr; /* Fragment address */
308 union {
309 struct {
310 __u8 l_i_frag; /* Fragment number */
311 __u8 l_i_fsize; /* Fragment size */
312 __le16 l_i_file_acl_high;
313 __le16 l_i_uid_high; /* these 2 fields */
314 __le16 l_i_gid_high; /* were reserved2[0] */
315 __u32 l_i_reserved2;
316 } linux2;
317 struct {
318 __u8 h_i_frag; /* Fragment number */
319 __u8 h_i_fsize; /* Fragment size */
320 __u16 h_i_mode_high;
321 __u16 h_i_uid_high;
322 __u16 h_i_gid_high;
323 __u32 h_i_author;
324 } hurd2;
325 struct {
326 __u8 m_i_frag; /* Fragment number */
327 __u8 m_i_fsize; /* Fragment size */
328 __le16 m_i_file_acl_high;
329 __u32 m_i_reserved2[2];
330 } masix2;
331 } osd2; /* OS dependent 2 */
332 __le16 i_extra_isize;
333 __le16 i_pad1;
334};
335
336#define i_size_high i_dir_acl
337
338#if defined(__KERNEL__) || defined(__linux__)
339#define i_reserved1 osd1.linux1.l_i_reserved1
340#define i_frag osd2.linux2.l_i_frag
341#define i_fsize osd2.linux2.l_i_fsize
342#define i_file_acl_high osd2.linux2.l_i_file_acl_high
343#define i_uid_low i_uid
344#define i_gid_low i_gid
345#define i_uid_high osd2.linux2.l_i_uid_high
346#define i_gid_high osd2.linux2.l_i_gid_high
347#define i_reserved2 osd2.linux2.l_i_reserved2
348
349#elif defined(__GNU__)
350
351#define i_translator osd1.hurd1.h_i_translator
352#define i_frag osd2.hurd2.h_i_frag;
353#define i_fsize osd2.hurd2.h_i_fsize;
354#define i_uid_high osd2.hurd2.h_i_uid_high
355#define i_gid_high osd2.hurd2.h_i_gid_high
356#define i_author osd2.hurd2.h_i_author
357
358#elif defined(__masix__)
359
360#define i_reserved1 osd1.masix1.m_i_reserved1
361#define i_frag osd2.masix2.m_i_frag
362#define i_fsize osd2.masix2.m_i_fsize
363#define i_file_acl_high osd2.masix2.m_i_file_acl_high
364#define i_reserved2 osd2.masix2.m_i_reserved2
365
366#endif /* defined(__KERNEL__) || defined(__linux__) */
367
368/*
369 * File system states
370 */
371#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
372#define EXT4_ERROR_FS 0x0002 /* Errors detected */
373#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
374
375/*
376 * Mount flags
377 */
378#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
379#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
380#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
381#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
382#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
383#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
384#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
385#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
386#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
387#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */
388#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
389#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
390#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
391#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
392#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
393#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
394#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
395#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
396#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */
397#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
398#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
399#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
400#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
401#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
402#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
403
404/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
405#ifndef _LINUX_EXT2_FS_H
406#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
407#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
408#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
409 EXT4_MOUNT_##opt)
410#else
411#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD
412#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT
413#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS
414#endif
415
416#define ext4_set_bit ext2_set_bit
417#define ext4_set_bit_atomic ext2_set_bit_atomic
418#define ext4_clear_bit ext2_clear_bit
419#define ext4_clear_bit_atomic ext2_clear_bit_atomic
420#define ext4_test_bit ext2_test_bit
421#define ext4_find_first_zero_bit ext2_find_first_zero_bit
422#define ext4_find_next_zero_bit ext2_find_next_zero_bit
423
424/*
425 * Maximal mount counts between two filesystem checks
426 */
427#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
428#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */
429
430/*
431 * Behaviour when detecting errors
432 */
433#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */
434#define EXT4_ERRORS_RO 2 /* Remount fs read-only */
435#define EXT4_ERRORS_PANIC 3 /* Panic */
436#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
437
438/*
439 * Structure of the super block
440 */
441struct ext4_super_block {
442/*00*/ __le32 s_inodes_count; /* Inodes count */
443 __le32 s_blocks_count; /* Blocks count */
444 __le32 s_r_blocks_count; /* Reserved blocks count */
445 __le32 s_free_blocks_count; /* Free blocks count */
446/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
447 __le32 s_first_data_block; /* First Data Block */
448 __le32 s_log_block_size; /* Block size */
449 __le32 s_log_frag_size; /* Fragment size */
450/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
451 __le32 s_frags_per_group; /* # Fragments per group */
452 __le32 s_inodes_per_group; /* # Inodes per group */
453 __le32 s_mtime; /* Mount time */
454/*30*/ __le32 s_wtime; /* Write time */
455 __le16 s_mnt_count; /* Mount count */
456 __le16 s_max_mnt_count; /* Maximal mount count */
457 __le16 s_magic; /* Magic signature */
458 __le16 s_state; /* File system state */
459 __le16 s_errors; /* Behaviour when detecting errors */
460 __le16 s_minor_rev_level; /* minor revision level */
461/*40*/ __le32 s_lastcheck; /* time of last check */
462 __le32 s_checkinterval; /* max. time between checks */
463 __le32 s_creator_os; /* OS */
464 __le32 s_rev_level; /* Revision level */
465/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
466 __le16 s_def_resgid; /* Default gid for reserved blocks */
467 /*
468 * These fields are for EXT4_DYNAMIC_REV superblocks only.
469 *
470 * Note: the difference between the compatible feature set and
471 * the incompatible feature set is that if there is a bit set
472 * in the incompatible feature set that the kernel doesn't
473 * know about, it should refuse to mount the filesystem.
474 *
475 * e2fsck's requirements are more strict; if it doesn't know
476 * about a feature in either the compatible or incompatible
477 * feature set, it must abort and not try to meddle with
478 * things it doesn't understand...
479 */
480 __le32 s_first_ino; /* First non-reserved inode */
481 __le16 s_inode_size; /* size of inode structure */
482 __le16 s_block_group_nr; /* block group # of this superblock */
483 __le32 s_feature_compat; /* compatible feature set */
484/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
485 __le32 s_feature_ro_compat; /* readonly-compatible feature set */
486/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
487/*78*/ char s_volume_name[16]; /* volume name */
488/*88*/ char s_last_mounted[64]; /* directory where last mounted */
489/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
490 /*
491 * Performance hints. Directory preallocation should only
492 * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
493 */
494 __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
495 __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
496 __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
497 /*
498 * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
499 */
500/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
501/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
502 __le32 s_journal_dev; /* device number of journal file */
503 __le32 s_last_orphan; /* start of list of inodes to delete */
504 __le32 s_hash_seed[4]; /* HTREE hash seed */
505 __u8 s_def_hash_version; /* Default hash version to use */
506 __u8 s_reserved_char_pad;
507 __le16 s_desc_size; /* size of group descriptor */
508/*100*/ __le32 s_default_mount_opts;
509 __le32 s_first_meta_bg; /* First metablock block group */
510 __le32 s_mkfs_time; /* When the filesystem was created */
511 __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
512 /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
513/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
514 __le32 s_r_blocks_count_hi; /* Reserved blocks count */
515 __le32 s_free_blocks_count_hi; /* Free blocks count */
516 __u32 s_reserved[169]; /* Padding to the end of the block */
517};
518
519#ifdef __KERNEL__
520static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb)
521{
522 return sb->s_fs_info;
523}
524static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
525{
526 return container_of(inode, struct ext4_inode_info, vfs_inode);
527}
528
529static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
530{
531 return ino == EXT4_ROOT_INO ||
532 ino == EXT4_JOURNAL_INO ||
533 ino == EXT4_RESIZE_INO ||
534 (ino >= EXT4_FIRST_INO(sb) &&
535 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
536}
537#else
538/* Assume that user mode programs are passing in an ext4fs superblock, not
539 * a kernel struct super_block. This will allow us to call the feature-test
540 * macros from user land. */
541#define EXT4_SB(sb) (sb)
542#endif
543
544#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
545
546/*
547 * Codes for operating systems
548 */
549#define EXT4_OS_LINUX 0
550#define EXT4_OS_HURD 1
551#define EXT4_OS_MASIX 2
552#define EXT4_OS_FREEBSD 3
553#define EXT4_OS_LITES 4
554
555/*
556 * Revision levels
557 */
558#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
559#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
560
561#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
562#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
563
564#define EXT4_GOOD_OLD_INODE_SIZE 128
565
566/*
567 * Feature set definitions
568 */
569
570#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
571 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
572#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
573 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
574#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
575 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
576#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
577 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
578#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
579 EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
580#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \
581 EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
582#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \
583 EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
584#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
585 EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
586#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \
587 EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
588
589#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001
590#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002
591#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004
592#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
593#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
594#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
595
596#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
597#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
598#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
599
600#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
601#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
602#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
603#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
604#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010
605#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
606#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
607
608#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
609#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
610 EXT4_FEATURE_INCOMPAT_RECOVER| \
611 EXT4_FEATURE_INCOMPAT_META_BG| \
612 EXT4_FEATURE_INCOMPAT_EXTENTS| \
613 EXT4_FEATURE_INCOMPAT_64BIT)
614#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
615 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
616 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
617
618/*
619 * Default values for user and/or group using reserved blocks
620 */
621#define EXT4_DEF_RESUID 0
622#define EXT4_DEF_RESGID 0
623
624/*
625 * Default mount options
626 */
627#define EXT4_DEFM_DEBUG 0x0001
628#define EXT4_DEFM_BSDGROUPS 0x0002
629#define EXT4_DEFM_XATTR_USER 0x0004
630#define EXT4_DEFM_ACL 0x0008
631#define EXT4_DEFM_UID16 0x0010
632#define EXT4_DEFM_JMODE 0x0060
633#define EXT4_DEFM_JMODE_DATA 0x0020
634#define EXT4_DEFM_JMODE_ORDERED 0x0040
635#define EXT4_DEFM_JMODE_WBACK 0x0060
636
637/*
638 * Structure of a directory entry
639 */
640#define EXT4_NAME_LEN 255
641
642struct ext4_dir_entry {
643 __le32 inode; /* Inode number */
644 __le16 rec_len; /* Directory entry length */
645 __le16 name_len; /* Name length */
646 char name[EXT4_NAME_LEN]; /* File name */
647};
648
649/*
650 * The new version of the directory entry. Since EXT4 structures are
651 * stored in intel byte order, and the name_len field could never be
652 * bigger than 255 chars, it's safe to reclaim the extra byte for the
653 * file_type field.
654 */
655struct ext4_dir_entry_2 {
656 __le32 inode; /* Inode number */
657 __le16 rec_len; /* Directory entry length */
658 __u8 name_len; /* Name length */
659 __u8 file_type;
660 char name[EXT4_NAME_LEN]; /* File name */
661};
662
663/*
664 * Ext4 directory file types. Only the low 3 bits are used. The
665 * other bits are reserved for now.
666 */
667#define EXT4_FT_UNKNOWN 0
668#define EXT4_FT_REG_FILE 1
669#define EXT4_FT_DIR 2
670#define EXT4_FT_CHRDEV 3
671#define EXT4_FT_BLKDEV 4
672#define EXT4_FT_FIFO 5
673#define EXT4_FT_SOCK 6
674#define EXT4_FT_SYMLINK 7
675
676#define EXT4_FT_MAX 8
677
678/*
679 * EXT4_DIR_PAD defines the directory entries boundaries
680 *
681 * NOTE: It must be a multiple of 4
682 */
683#define EXT4_DIR_PAD 4
684#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
685#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
686 ~EXT4_DIR_ROUND)
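/*
 * Illustrative note, not part of the original header: for a 5-character
 * name, EXT4_DIR_REC_LEN(5) = (5 + 8 + 3) & ~3 = 16, i.e. the directory
 * entry is padded to 16 bytes on disk.
 */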
687/*
688 * Hash Tree Directory indexing
689 * (c) Daniel Phillips, 2001
690 */
691
692#ifdef CONFIG_EXT4_INDEX
693 #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
694 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
695 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
696#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
697#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
698#else
699 #define is_dx(dir) 0
700#define EXT4_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT4_LINK_MAX)
701#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
702#endif
703
704/* Legal values for the dx_root hash_version field: */
705
706#define DX_HASH_LEGACY 0
707#define DX_HASH_HALF_MD4 1
708#define DX_HASH_TEA 2
709
710#ifdef __KERNEL__
711
712/* hash info structure used by the directory hash */
713struct dx_hash_info
714{
715 u32 hash;
716 u32 minor_hash;
717 int hash_version;
718 u32 *seed;
719};
720
721#define EXT4_HTREE_EOF 0x7fffffff
722
723/*
724 * Control parameters used by ext4_htree_next_block
725 */
726#define HASH_NB_ALWAYS 1
727
728
729/*
730 * Describe an inode's exact location on disk and in memory
731 */
732struct ext4_iloc
733{
734 struct buffer_head *bh;
735 unsigned long offset;
736 unsigned long block_group;
737};
738
739static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
740{
741 return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
742}
743
744/*
745 * This structure is stuffed into the struct file's private_data field
746 * for directories. It is where we put information so that we can do
747 * readdir operations in hash tree order.
748 */
749struct dir_private_info {
750 struct rb_root root;
751 struct rb_node *curr_node;
752 struct fname *extra_fname;
753 loff_t last_pos;
754 __u32 curr_hash;
755 __u32 curr_minor_hash;
756 __u32 next_hash;
757};
758
759/* calculate the first block number of the group */
760static inline ext4_fsblk_t
761ext4_group_first_block_no(struct super_block *sb, unsigned long group_no)
762{
763 return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
764 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
765}
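/*
 * Illustrative note, not part of the original header: taking a 1KB-block
 * filesystem as an example (s_first_data_block == 1, 8192 blocks per
 * group), ext4_group_first_block_no(sb, 3) = 3 * 8192 + 1 = 24577.
 */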
766
767/*
768 * Special error return code only used by dx_probe() and its callers.
769 */
770#define ERR_BAD_DX_DIR -75000
771
772void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
773 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
774
775/*
776 * Function prototypes
777 */
778
779/*
780 * Ok, these declarations are also in <linux/kernel.h> but none of the
781 * ext4 source programs needs to include it so they are duplicated here.
782 */
783# define NORET_TYPE /**/
784# define ATTRIB_NORET __attribute__((noreturn))
785# define NORET_AND noreturn,
786
787/* balloc.c */
788extern unsigned int ext4_block_group(struct super_block *sb,
789 ext4_fsblk_t blocknr);
790extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
791 ext4_fsblk_t blocknr);
792extern int ext4_bg_has_super(struct super_block *sb, int group);
793extern unsigned long ext4_bg_num_gdb(struct super_block *sb, int group);
794extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
795 ext4_fsblk_t goal, int *errp);
796extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
797 ext4_fsblk_t goal, unsigned long *count, int *errp);
798extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
799 ext4_fsblk_t block, unsigned long count);
800extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
801 ext4_fsblk_t block, unsigned long count,
802 unsigned long *pdquot_freed_blocks);
803extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
804extern void ext4_check_blocks_bitmap (struct super_block *);
805extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
806 unsigned int block_group,
807 struct buffer_head ** bh);
808extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
809extern void ext4_init_block_alloc_info(struct inode *);
810extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
811
812/* dir.c */
813extern int ext4_check_dir_entry(const char *, struct inode *,
814 struct ext4_dir_entry_2 *,
815 struct buffer_head *, unsigned long);
816extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
817 __u32 minor_hash,
818 struct ext4_dir_entry_2 *dirent);
819extern void ext4_htree_free_dir_info(struct dir_private_info *p);
820
821/* fsync.c */
822extern int ext4_sync_file (struct file *, struct dentry *, int);
823
824/* hash.c */
825extern int ext4fs_dirhash(const char *name, int len, struct
826 dx_hash_info *hinfo);
827
828/* ialloc.c */
829extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
830extern void ext4_free_inode (handle_t *, struct inode *);
831extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
832extern unsigned long ext4_count_free_inodes (struct super_block *);
833extern unsigned long ext4_count_dirs (struct super_block *);
834extern void ext4_check_inodes_bitmap (struct super_block *);
835extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
836
837
838/* inode.c */
839int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
840 struct buffer_head *bh, ext4_fsblk_t blocknr);
841struct buffer_head * ext4_getblk (handle_t *, struct inode *, long, int, int *);
842struct buffer_head * ext4_bread (handle_t *, struct inode *, int, int, int *);
843int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
844 sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
845 int create, int extend_disksize);
846
847extern void ext4_read_inode (struct inode *);
848extern int ext4_write_inode (struct inode *, int);
849extern int ext4_setattr (struct dentry *, struct iattr *);
850extern void ext4_delete_inode (struct inode *);
851extern int ext4_sync_inode (handle_t *, struct inode *);
852extern void ext4_discard_reservation (struct inode *);
853extern void ext4_dirty_inode(struct inode *);
854extern int ext4_change_inode_journal_flag(struct inode *, int);
855extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
856extern void ext4_truncate (struct inode *);
857extern void ext4_set_inode_flags(struct inode *);
858extern void ext4_set_aops(struct inode *inode);
859extern int ext4_writepage_trans_blocks(struct inode *);
860extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
861 struct address_space *mapping, loff_t from);
862
863/* ioctl.c */
864extern int ext4_ioctl (struct inode *, struct file *, unsigned int,
865 unsigned long);
866extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
867
868/* namei.c */
869extern int ext4_orphan_add(handle_t *, struct inode *);
870extern int ext4_orphan_del(handle_t *, struct inode *);
871extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
872 __u32 start_minor_hash, __u32 *next_hash);
873
874/* resize.c */
875extern int ext4_group_add(struct super_block *sb,
876 struct ext4_new_group_data *input);
877extern int ext4_group_extend(struct super_block *sb,
878 struct ext4_super_block *es,
879 ext4_fsblk_t n_blocks_count);
880
881/* super.c */
882extern void ext4_error (struct super_block *, const char *, const char *, ...)
883 __attribute__ ((format (printf, 3, 4)));
884extern void __ext4_std_error (struct super_block *, const char *, int);
885extern void ext4_abort (struct super_block *, const char *, const char *, ...)
886 __attribute__ ((format (printf, 3, 4)));
887extern void ext4_warning (struct super_block *, const char *, const char *, ...)
888 __attribute__ ((format (printf, 3, 4)));
889extern void ext4_update_dynamic_rev (struct super_block *sb);
890extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
891 struct ext4_group_desc *bg);
892extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
893 struct ext4_group_desc *bg);
894extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
895 struct ext4_group_desc *bg);
896extern void ext4_block_bitmap_set(struct super_block *sb,
897 struct ext4_group_desc *bg, ext4_fsblk_t blk);
898extern void ext4_inode_bitmap_set(struct super_block *sb,
899 struct ext4_group_desc *bg, ext4_fsblk_t blk);
900extern void ext4_inode_table_set(struct super_block *sb,
901 struct ext4_group_desc *bg, ext4_fsblk_t blk);
902
903static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
904{
905 return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
906 le32_to_cpu(es->s_blocks_count);
907}
908
909static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
910{
911 return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
912 le32_to_cpu(es->s_r_blocks_count);
913}
914
915static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
916{
917 return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
918 le32_to_cpu(es->s_free_blocks_count);
919}
920
921static inline void ext4_blocks_count_set(struct ext4_super_block *es,
922 ext4_fsblk_t blk)
923{
924 es->s_blocks_count = cpu_to_le32((u32)blk);
925 es->s_blocks_count_hi = cpu_to_le32(blk >> 32);
926}
927
928static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
929 ext4_fsblk_t blk)
930{
931 es->s_free_blocks_count = cpu_to_le32((u32)blk);
932 es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32);
933}
934
935static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
936 ext4_fsblk_t blk)
937{
938 es->s_r_blocks_count = cpu_to_le32((u32)blk);
939 es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
940}
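/*
 * Illustrative sketch (not part of this patch): the helpers above split a
 * 64-bit ext4_fsblk_t across a low and a high little-endian 32-bit on-disk
 * field.  The hypothetical check below simply round-trips an arbitrary value
 * to show that a set followed by a get recovers the original count.
 */
static inline int example_blocks_count_roundtrip(struct ext4_super_block *es)
{
	ext4_fsblk_t sample = 0x123456789aULL;	/* arbitrary example value */

	ext4_blocks_count_set(es, sample);
	return ext4_blocks_count(es) == sample;
}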
941
942
943
944#define ext4_std_error(sb, errno) \
945do { \
946 if ((errno)) \
947 __ext4_std_error((sb), __FUNCTION__, (errno)); \
948} while (0)
949
950/*
951 * Inode and file operations
952 */
953
954/* dir.c */
955extern const struct file_operations ext4_dir_operations;
956
957/* file.c */
958extern struct inode_operations ext4_file_inode_operations;
959extern const struct file_operations ext4_file_operations;
960
961/* namei.c */
962extern struct inode_operations ext4_dir_inode_operations;
963extern struct inode_operations ext4_special_inode_operations;
964
965/* symlink.c */
966extern struct inode_operations ext4_symlink_inode_operations;
967extern struct inode_operations ext4_fast_symlink_inode_operations;
968
969/* extents.c */
970extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
971extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
972extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
973 ext4_fsblk_t iblock,
974 unsigned long max_blocks, struct buffer_head *bh_result,
975 int create, int extend_disksize);
976extern void ext4_ext_truncate(struct inode *, struct page *);
977extern void ext4_ext_init(struct super_block *);
978extern void ext4_ext_release(struct super_block *);
979static inline int
980ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
981 unsigned long max_blocks, struct buffer_head *bh,
982 int create, int extend_disksize)
983{
984 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
985 return ext4_ext_get_blocks(handle, inode, block, max_blocks,
986 bh, create, extend_disksize);
987 return ext4_get_blocks_handle(handle, inode, block, max_blocks, bh,
988 create, extend_disksize);
989}
990
991
992#endif /* __KERNEL__ */
993
994#endif /* _LINUX_EXT4_FS_H */
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
new file mode 100644
index 000000000000..a41cc24568ca
--- /dev/null
+++ b/include/linux/ext4_fs_extents.h
@@ -0,0 +1,198 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
17 */
18
19#ifndef _LINUX_EXT4_EXTENTS
20#define _LINUX_EXT4_EXTENTS
21
22#include <linux/ext4_fs.h>
23
24/*
25 * With AGRESSIVE_TEST defined, the capacity of index/leaf blocks
26 * becomes very small, so index split, in-depth growing and
27 * other hard changes happen much more often.
28 * This is for debug purposes only.
29 */
30#define AGRESSIVE_TEST_
31
32/*
33 * With EXTENTS_STATS defined, the numbers of blocks and extents
34 * are collected in the truncate path. They'll be shown at
35 * umount time.
36 */
37#define EXTENTS_STATS__
38
39/*
40 * If CHECK_BINSEARCH is defined, then the results of the binary search
41 * will also be checked by linear search.
42 */
43#define CHECK_BINSEARCH__
44
45/*
46 * If EXT_DEBUG is defined you can use the 'extdebug' mount option
47 * to get lots of info about what's going on.
48 */
49#define EXT_DEBUG__
50#ifdef EXT_DEBUG
51#define ext_debug(a...) printk(a)
52#else
53#define ext_debug(a...)
54#endif
55
56/*
57 * If EXT_STATS is defined then stats numbers are collected.
58 * These numbers will be displayed at umount time.
59 */
60#define EXT_STATS_
61
62
63/*
64 * ext4_inode has i_block array (60 bytes total).
65 * The first 12 bytes store ext4_extent_header;
66 * the remainder stores an array of ext4_extent.
67 */
68
69/*
70 * This is the extent on-disk structure.
71 * It's used at the bottom of the tree.
72 */
73struct ext4_extent {
74 __le32 ee_block; /* first logical block extent covers */
75 __le16 ee_len; /* number of blocks covered by extent */
76 __le16 ee_start_hi; /* high 16 bits of physical block */
77 __le32 ee_start; /* low 32 bits of physical block */
78};
79
80/*
81 * This is the index on-disk structure.
82 * It's used at all the levels except the bottom.
83 */
84struct ext4_extent_idx {
85 __le32 ei_block; /* index covers logical blocks from 'block' */
86 __le32 ei_leaf; /* pointer to the physical block of the next *
87 * level. leaf or next index could be there */
88 __le16 ei_leaf_hi; /* high 16 bits of physical block */
89 __u16 ei_unused;
90};
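/*
 * Illustrative sketch (not part of this patch): an extent's physical start
 * block is split into a low 32-bit field and a high 16-bit field, giving a
 * 48-bit block number; index entries split ei_leaf/ei_leaf_hi the same way.
 * The helper name below is an assumption, not something defined here.
 */
static inline ext4_fsblk_t example_ext_pblock(struct ext4_extent *ex)
{
	return le32_to_cpu(ex->ee_start) |
		((ext4_fsblk_t)le16_to_cpu(ex->ee_start_hi) << 32);
}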
91
92/*
93 * Each block (leaves and indexes), even the one stored in the inode, has a header.
94 */
95struct ext4_extent_header {
96 __le16 eh_magic; /* probably will support different formats */
97 __le16 eh_entries; /* number of valid entries */
98 __le16 eh_max; /* capacity of store in entries */
99 __le16 eh_depth; /* does the tree have real underlying blocks? */
100 __le32 eh_generation; /* generation of the tree */
101};
102
103#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
104
105/*
106 * Array of ext4_ext_path contains path to some extent.
107 * Creation/lookup routines use it for traversal/splitting/etc.
108 * Truncate uses it to simulate recursive walking.
109 */
110struct ext4_ext_path {
111 ext4_fsblk_t p_block;
112 __u16 p_depth;
113 struct ext4_extent *p_ext;
114 struct ext4_extent_idx *p_idx;
115 struct ext4_extent_header *p_hdr;
116 struct buffer_head *p_bh;
117};
118
119/*
120 * structure for external API
121 */
122
123#define EXT4_EXT_CACHE_NO 0
124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2
126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
140
141
142#define EXT_MAX_BLOCK 0xffffffff
143
144#define EXT_MAX_LEN ((1UL << 15) - 1)
145
146
147#define EXT_FIRST_EXTENT(__hdr__) \
148 ((struct ext4_extent *) (((char *) (__hdr__)) + \
149 sizeof(struct ext4_extent_header)))
150#define EXT_FIRST_INDEX(__hdr__) \
151 ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \
152 sizeof(struct ext4_extent_header)))
153#define EXT_HAS_FREE_INDEX(__path__) \
154 (le16_to_cpu((__path__)->p_hdr->eh_entries) \
155 < le16_to_cpu((__path__)->p_hdr->eh_max))
156#define EXT_LAST_EXTENT(__hdr__) \
157 (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
158#define EXT_LAST_INDEX(__hdr__) \
159 (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
160#define EXT_MAX_EXTENT(__hdr__) \
161 (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
162#define EXT_MAX_INDEX(__hdr__) \
163 (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
164
165static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
166{
167 return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
168}
169
170static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
171{
172 return (struct ext4_extent_header *) bh->b_data;
173}
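/*
 * Illustrative sketch (not part of this patch): the EXT_FIRST_EXTENT() and
 * EXT_LAST_EXTENT() macros above are typically paired to walk every valid
 * entry in a leaf block.  The function name here is hypothetical.
 */
static inline void example_dump_leaf(struct buffer_head *bh)
{
	struct ext4_extent_header *eh = ext_block_hdr(bh);
	struct ext4_extent *ex;

	for (ex = EXT_FIRST_EXTENT(eh); ex <= EXT_LAST_EXTENT(eh); ex++)
		ext_debug("extent %u:%u\n", le32_to_cpu(ex->ee_block),
			  (unsigned int)le16_to_cpu(ex->ee_len));
}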
174
175static inline unsigned short ext_depth(struct inode *inode)
176{
177 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
178}
179
180static inline void ext4_ext_tree_changed(struct inode *inode)
181{
182 EXT4_I(inode)->i_ext_generation++;
183}
184
185static inline void
186ext4_ext_invalidate_cache(struct inode *inode)
187{
188 EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
189}
190
191extern int ext4_extent_tree_init(handle_t *, struct inode *);
192extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
193extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
194extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
195extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
196
197#endif /* _LINUX_EXT4_EXTENTS */
198
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
new file mode 100644
index 000000000000..bb42379cb7fd
--- /dev/null
+++ b/include/linux/ext4_fs_i.h
@@ -0,0 +1,158 @@
1/*
2 * linux/include/linux/ext4_fs_i.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_i.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _LINUX_EXT4_FS_I
17#define _LINUX_EXT4_FS_I
18
19#include <linux/rwsem.h>
20#include <linux/rbtree.h>
21#include <linux/seqlock.h>
22#include <linux/mutex.h>
23
24/* data type for block offset of block group */
25typedef int ext4_grpblk_t;
26
27/* data type for filesystem-wide blocks number */
28typedef unsigned long long ext4_fsblk_t;
29
30struct ext4_reserve_window {
31 ext4_fsblk_t _rsv_start; /* First byte reserved */
32 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
33};
34
35struct ext4_reserve_window_node {
36 struct rb_node rsv_node;
37 __u32 rsv_goal_size;
38 __u32 rsv_alloc_hit;
39 struct ext4_reserve_window rsv_window;
40};
41
42struct ext4_block_alloc_info {
43 /* information about reservation window */
44 struct ext4_reserve_window_node rsv_window_node;
45 /*
46 * was i_next_alloc_block in ext4_inode_info
47 * is the logical (file-relative) number of the
48 * most-recently-allocated block in this file.
49 * We use this for detecting linearly ascending allocation requests.
50 */
51 __u32 last_alloc_logical_block;
52 /*
53 * Was i_next_alloc_goal in ext4_inode_info
54 * is the *physical* companion to i_next_alloc_block.
55 * it is the physical block number of the block which was most recently
56 * allocated to this file. This gives us the goal (target) for the next
57 * allocation when we detect linearly ascending requests.
58 */
59 ext4_fsblk_t last_alloc_physical_block;
60};
61
62#define rsv_start rsv_window._rsv_start
63#define rsv_end rsv_window._rsv_end
64
65/*
66 * storage for cached extent
67 */
68struct ext4_ext_cache {
69 ext4_fsblk_t ec_start;
70 __u32 ec_block;
71 __u32 ec_len; /* must be 32bit to return holes */
72 __u32 ec_type;
73};
74
75/*
76 * fourth extended file system inode data in memory
77 */
78struct ext4_inode_info {
79 __le32 i_data[15]; /* unconverted */
80 __u32 i_flags;
81#ifdef EXT4_FRAGMENTS
82 __u32 i_faddr;
83 __u8 i_frag_no;
84 __u8 i_frag_size;
85#endif
86 ext4_fsblk_t i_file_acl;
87 __u32 i_dir_acl;
88 __u32 i_dtime;
89
90 /*
91 * i_block_group is the number of the block group which contains
92 * this file's inode. Constant across the lifetime of the inode,
93 * it is used for making block allocation decisions - we try to
94 * place a file's data blocks near its inode block, and new inodes
95 * near to their parent directory's inode.
96 */
97 __u32 i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */
99
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 __u32 i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR
105 /*
106 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention
108 * between readers of EAs and writers of regular file data, so
109 * instead we synchronize on xattr_sem when reading or changing
110 * EAs.
111 */
112 struct rw_semaphore xattr_sem;
113#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
115 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl;
117#endif
118
119 struct list_head i_orphan; /* unlinked but open inodes */
120
121 /*
122 * i_disksize keeps track of what the inode size is ON DISK, not
123 * in memory. During truncate, i_size is set to the new size by
124 * the VFS prior to calling ext4_truncate(), but the filesystem won't
125 * set i_disksize to 0 until the truncate is actually under way.
126 *
127 * The intent is that i_disksize always represents the blocks which
128 * are used by this file. This allows recovery to restart truncate
129 * on orphans if we crash during truncate. We actually write i_disksize
130 * into the on-disk inode when writing inodes out, instead of i_size.
131 *
132 * The only time when i_disksize and i_size may be different is when
133 * a truncate is in progress. The only things which change i_disksize
134 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
135 */
136 loff_t i_disksize;
137
138 /* on-disk additional length */
139 __u16 i_extra_isize;
140
141 /*
142 * truncate_mutex is for serialising ext4_truncate() against
143 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
144 * data tree are chopped off during truncate. We can't do that in
145 * ext4 because whenever we perform intermediate commits during
146 * truncate, the inode and all the metadata blocks *must* be in a
147 * consistent state which allows truncation of the orphans to restart
148 * during recovery. Hence we must fix the get_block-vs-truncate race
149 * by other means, so we have truncate_mutex.
150 */
151 struct mutex truncate_mutex;
152 struct inode vfs_inode;
153
154 unsigned long i_ext_generation;
155 struct ext4_ext_cache i_cached_extent;
156};
157
158#endif /* _LINUX_EXT4_FS_I */
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
new file mode 100644
index 000000000000..691a713139ce
--- /dev/null
+++ b/include/linux/ext4_fs_sb.h
@@ -0,0 +1,94 @@
1/*
2 * linux/include/linux/ext4_fs_sb.h
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/include/linux/minix_fs_sb.h
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 */
15
16#ifndef _LINUX_EXT4_FS_SB
17#define _LINUX_EXT4_FS_SB
18
19#ifdef __KERNEL__
20#include <linux/timer.h>
21#include <linux/wait.h>
22#include <linux/blockgroup_lock.h>
23#include <linux/percpu_counter.h>
24#endif
25#include <linux/rbtree.h>
26
27/*
28 * fourth extended-fs super-block data in memory
29 */
30struct ext4_sb_info {
31 unsigned long s_frag_size; /* Size of a fragment in bytes */
32 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
33 unsigned long s_frags_per_block;/* Number of fragments per block */
34 unsigned long s_inodes_per_block;/* Number of inodes per block */
35 unsigned long s_frags_per_group;/* Number of fragments in a group */
36 unsigned long s_blocks_per_group;/* Number of blocks in a group */
37 unsigned long s_inodes_per_group;/* Number of inodes in a group */
38 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
39 unsigned long s_gdb_count; /* Number of group descriptor blocks */
40 unsigned long s_desc_per_block; /* Number of group descriptors per block */
41 unsigned long s_groups_count; /* Number of groups in the fs */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc;
45 unsigned long s_mount_opt;
46 uid_t s_resuid;
47 gid_t s_resgid;
48 unsigned short s_mount_state;
49 unsigned short s_pad;
50 int s_addr_per_block_bits;
51 int s_desc_per_block_bits;
52 int s_inode_size;
53 int s_first_ino;
54 spinlock_t s_next_gen_lock;
55 u32 s_next_generation;
56 u32 s_hash_seed[4];
57 int s_def_hash_version;
58 struct percpu_counter s_freeblocks_counter;
59 struct percpu_counter s_freeinodes_counter;
60 struct percpu_counter s_dirs_counter;
61 struct blockgroup_lock s_blockgroup_lock;
62
63 /* root of the per fs reservation window tree */
64 spinlock_t s_rsv_window_lock;
65 struct rb_root s_rsv_window_root;
66 struct ext4_reserve_window_node s_rsv_window_head;
67
68 /* Journaling */
69 struct inode * s_journal_inode;
70 struct journal_s * s_journal;
71 struct list_head s_orphan;
72 unsigned long s_commit_interval;
73 struct block_device *journal_bdev;
74#ifdef CONFIG_JBD_DEBUG
75 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
76 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
77#endif
78#ifdef CONFIG_QUOTA
79 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
80 int s_jquota_fmt; /* Format of quota to use */
81#endif
82
83#ifdef EXTENTS_STATS
84 /* ext4 extents stats */
85 unsigned long s_ext_min;
86 unsigned long s_ext_max;
87 unsigned long s_depth_max;
88 spinlock_t s_ext_stats_lock;
89 unsigned long s_ext_blocks;
90 unsigned long s_ext_extents;
91#endif
92};
93
94#endif /* _LINUX_EXT4_FS_SB */
diff --git a/include/linux/ext4_jbd2.h b/include/linux/ext4_jbd2.h
new file mode 100644
index 000000000000..72dd631912e4
--- /dev/null
+++ b/include/linux/ext4_jbd2.h
@@ -0,0 +1,273 @@
1/*
2 * linux/include/linux/ext4_jbd2.h
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Ext4-specific journaling extensions.
13 */
14
15#ifndef _LINUX_EXT4_JBD_H
16#define _LINUX_EXT4_JBD_H
17
18#include <linux/fs.h>
19#include <linux/jbd2.h>
20#include <linux/ext4_fs.h>
21
22#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
23
24/* Define the number of blocks we need to account to a transaction to
25 * modify one block of data.
26 *
27 * We may have to touch one inode, one bitmap buffer, up to three
28 * indirection blocks, the group and superblock summaries, and the data
29 * block to complete the transaction.
30 *
31 * For extents-enabled fs we may have to allocate and modify up to
32 * 5 levels of tree + root which are stored in the inode. */
33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U)
37
38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode
40 * and the superblock, which are already accounted for. */
41
42#define EXT4_XATTR_TRANS_BLOCKS 6U
43
44/* Define the minimum size for a transaction which modifies data. This
45 * needs to take into account the fact that we may end up modifying two
46 * quota files too (one for the group, one for the user quota). The
47 * superblock only gets updated once, of course, so don't bother
48 * counting that again for the quota updates. */
49
50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
53
54/* Delete operations potentially hit one directory's namespace plus an
55 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
56 * generous. We can grow the delete transaction later if necessary. */
57
58#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
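/*
 * Worked example (not part of this patch): on a filesystem without the
 * extents feature and with quotas disabled, EXT4_SINGLEDATA_TRANS_BLOCKS()
 * is 8 and EXT4_QUOTA_TRANS_BLOCKS() is 0, so
 *
 *	EXT4_DATA_TRANS_BLOCKS(sb)   = 8 + 6 - 2 + 2 * 0 = 12
 *	EXT4_DELETE_TRANS_BLOCKS(sb) = 2 * 12 + 64       = 88
 *
 * With extents enabled the single-data figure becomes 27, giving 31 and 126
 * respectively.
 */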
59
60/* Define an arbitrary limit for the amount of data we will anticipate
61 * writing to any given transaction. For unbounded transactions such as
62 * write(2) and truncate(2) we can write more than this, but we always
63 * start off at the maximum transaction size and grow the transaction
64 * optimistically as we go. */
65
66#define EXT4_MAX_TRANS_DATA 64U
67
68/* We break up a large truncate or write transaction once the handle's
69 * buffer credits get this low; we then need either to extend the
70 * transaction or to start a new one. Reserve enough space here for
71 * inode, bitmap, superblock, group and indirection updates for at least
72 * one block, plus two quota updates. Quota allocations are not
73 * needed. */
74
75#define EXT4_RESERVE_TRANS_BLOCKS 12U
76
77#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
78
79#ifdef CONFIG_QUOTA
80/* Amount of blocks needed for quota update - we know that the structure was
81 * allocated so we need to update only inode+data */
82#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
83/* Amount of blocks needed for quota insert/delete - we do some block writes
84 * but inode, sb and group updates are done only once */
85#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
86 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
87#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
88 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
89#else
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
91#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
92#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
93#endif
94
95int
96ext4_mark_iloc_dirty(handle_t *handle,
97 struct inode *inode,
98 struct ext4_iloc *iloc);
99
100/*
101 * On success, we end up with an outstanding reference count against
102 * iloc->bh. This _must_ be cleaned up later.
103 */
104
105int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
106 struct ext4_iloc *iloc);
107
108int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
109
110/*
111 * Wrapper functions with which ext4 calls into JBD. The intent here is
112 * to allow these to be turned into appropriate stubs so ext4 can control
113 * ext2 filesystems, so ext2+ext4 systems only need one fs. This work hasn't
114 * been done yet.
115 */
116
117void ext4_journal_abort_handle(const char *caller, const char *err_fn,
118 struct buffer_head *bh, handle_t *handle, int err);
119
120static inline int
121__ext4_journal_get_undo_access(const char *where, handle_t *handle,
122 struct buffer_head *bh)
123{
124 int err = jbd2_journal_get_undo_access(handle, bh);
125 if (err)
126 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
127 return err;
128}
129
130static inline int
131__ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh)
133{
134 int err = jbd2_journal_get_write_access(handle, bh);
135 if (err)
136 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
137 return err;
138}
139
140static inline void
141ext4_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
142{
143 jbd2_journal_release_buffer(handle, bh);
144}
145
146static inline int
147__ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh)
148{
149 int err = jbd2_journal_forget(handle, bh);
150 if (err)
151 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
152 return err;
153}
154
155static inline int
156__ext4_journal_revoke(const char *where, handle_t *handle,
157 ext4_fsblk_t blocknr, struct buffer_head *bh)
158{
159 int err = jbd2_journal_revoke(handle, blocknr, bh);
160 if (err)
161 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
162 return err;
163}
164
165static inline int
166__ext4_journal_get_create_access(const char *where,
167 handle_t *handle, struct buffer_head *bh)
168{
169 int err = jbd2_journal_get_create_access(handle, bh);
170 if (err)
171 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
172 return err;
173}
174
175static inline int
176__ext4_journal_dirty_metadata(const char *where,
177 handle_t *handle, struct buffer_head *bh)
178{
179 int err = jbd2_journal_dirty_metadata(handle, bh);
180 if (err)
181 ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
182 return err;
183}
184
185
186#define ext4_journal_get_undo_access(handle, bh) \
187 __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
188#define ext4_journal_get_write_access(handle, bh) \
189 __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
190#define ext4_journal_revoke(handle, blocknr, bh) \
191 __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
192#define ext4_journal_get_create_access(handle, bh) \
193 __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
194#define ext4_journal_dirty_metadata(handle, bh) \
195 __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
196#define ext4_journal_forget(handle, bh) \
197 __ext4_journal_forget(__FUNCTION__, (handle), (bh))
198
199int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
200
201handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
202int __ext4_journal_stop(const char *where, handle_t *handle);
203
204static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
205{
206 return ext4_journal_start_sb(inode->i_sb, nblocks);
207}
208
209#define ext4_journal_stop(handle) \
210 __ext4_journal_stop(__FUNCTION__, (handle))
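/*
 * Illustrative sketch (not part of this patch): a typical metadata update
 * made through the wrappers above starts a handle, declares write access to
 * a buffer, modifies it, marks it dirty and stops the handle.  The function
 * name is hypothetical and error handling is kept to a bare minimum; it
 * assumes ext4_journal_start() returns an ERR_PTR value on failure.
 */
static inline int example_touch_metadata(struct inode *inode,
					 struct buffer_head *bh)
{
	handle_t *handle = ext4_journal_start(inode, 1);
	int err;

	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = ext4_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = ext4_journal_dirty_metadata(handle, bh);
	}
	ext4_journal_stop(handle);
	return err;
}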
211
212static inline handle_t *ext4_journal_current_handle(void)
213{
214 return journal_current_handle();
215}
216
217static inline int ext4_journal_extend(handle_t *handle, int nblocks)
218{
219 return jbd2_journal_extend(handle, nblocks);
220}
221
222static inline int ext4_journal_restart(handle_t *handle, int nblocks)
223{
224 return jbd2_journal_restart(handle, nblocks);
225}
226
227static inline int ext4_journal_blocks_per_page(struct inode *inode)
228{
229 return jbd2_journal_blocks_per_page(inode);
230}
231
232static inline int ext4_journal_force_commit(journal_t *journal)
233{
234 return jbd2_journal_force_commit(journal);
235}
236
237/* super.c */
238int ext4_force_commit(struct super_block *sb);
239
240static inline int ext4_should_journal_data(struct inode *inode)
241{
242 if (!S_ISREG(inode->i_mode))
243 return 1;
244 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
245 return 1;
246 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
247 return 1;
248 return 0;
249}
250
251static inline int ext4_should_order_data(struct inode *inode)
252{
253 if (!S_ISREG(inode->i_mode))
254 return 0;
255 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
256 return 0;
257 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
258 return 1;
259 return 0;
260}
261
262static inline int ext4_should_writeback_data(struct inode *inode)
263{
264 if (!S_ISREG(inode->i_mode))
265 return 0;
266 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
267 return 0;
268 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
269 return 1;
270 return 0;
271}
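/*
 * Illustrative sketch (not part of this patch): for a regular file exactly
 * one of the three predicates above is true, selected by the data= mount
 * option or by EXT4_JOURNAL_DATA_FL on the inode.  A hypothetical helper
 * naming the active mode:
 */
static inline const char *example_data_mode(struct inode *inode)
{
	if (ext4_should_journal_data(inode))
		return "journal";
	if (ext4_should_order_data(inode))
		return "ordered";
	if (ext4_should_writeback_data(inode))
		return "writeback";
	return "unknown";
}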
272
273#endif /* _LINUX_EXT4_JBD_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c25a38d8f600..5081d27bfa27 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -17,6 +17,7 @@ int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *
17int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 17int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
18int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); 18int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
19void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); 19void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
20void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
20int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 21int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
21int hugetlb_report_meminfo(char *); 22int hugetlb_report_meminfo(char *);
22int hugetlb_report_node_meminfo(int, char *); 23int hugetlb_report_node_meminfo(int, char *);
diff --git a/include/linux/io.h b/include/linux/io.h
index 2ad96c3f0e4e..81877ea39309 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -28,4 +28,31 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
28int ioremap_page_range(unsigned long addr, unsigned long end, 28int ioremap_page_range(unsigned long addr, unsigned long end,
29 unsigned long phys_addr, pgprot_t prot); 29 unsigned long phys_addr, pgprot_t prot);
30 30
31/**
32 * check_signature - find BIOS signatures
33 * @io_addr: mmio address to check
34 * @signature: signature block
35 * @length: length of signature
36 *
37 * Perform a signature comparison with the mmio address io_addr. This
38 * address should have been obtained by ioremap.
39 * Returns 1 on a match.
40 */
41
42static inline int check_signature(const volatile void __iomem *io_addr,
43 const unsigned char *signature, int length)
44{
45 int retval = 0;
46 do {
47 if (readb(io_addr) != *signature)
48 goto out;
49 io_addr++;
50 signature++;
51 length--;
52 } while (length);
53 retval = 1;
54out:
55 return retval;
56}
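/*
 * Illustrative usage (not part of this change): the address handed to
 * check_signature() must come from ioremap().  The physical address and
 * signature below are invented purely for the example.
 */
static inline int example_has_rom_signature(void)
{
	void __iomem *p = ioremap(0xC0000, 0x100);
	int found = 0;

	if (p) {
		found = check_signature(p, (const unsigned char *)"IBM", 3);
		iounmap(p);
	}
	return found;
}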
57
31#endif /* _LINUX_IO_H */ 58#endif /* _LINUX_IO_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
new file mode 100644
index 000000000000..ddb128795781
--- /dev/null
+++ b/include/linux/jbd2.h
@@ -0,0 +1,1107 @@
1/*
2 * linux/include/linux/jbd2.h
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>
5 *
6 * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Definitions for transaction data structures for the buffer cache
13 * filesystem journaling support.
14 */
15
16#ifndef _LINUX_JBD_H
17#define _LINUX_JBD_H
18
19/* Allow this file to be included directly into e2fsprogs */
20#ifndef __KERNEL__
21#include "jfs_compat.h"
22#define JBD2_DEBUG
23#define jfs_debug jbd_debug
24#else
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/journal-head.h>
29#include <linux/stddef.h>
30#include <linux/bit_spinlock.h>
31#include <linux/mutex.h>
32#include <linux/timer.h>
33
34#include <asm/semaphore.h>
35#endif
36
37#define journal_oom_retry 1
38
39/*
40 * Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds
41 * certain classes of error which can occur due to failed IOs. Under
42 * normal use we want ext3 to continue after such errors, because
43 * hardware _can_ fail, but for debugging purposes when running tests on
44 * known-good hardware we may want to trap these errors.
45 */
46#undef JBD_PARANOID_IOFAIL
47
48/*
49 * The default maximum commit age, in seconds.
50 */
51#define JBD_DEFAULT_MAX_COMMIT_AGE 5
52
53#ifdef CONFIG_JBD_DEBUG
54/*
55 * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
56 * consistency checks. By default we don't do this unless
57 * CONFIG_JBD_DEBUG is on.
58 */
59#define JBD_EXPENSIVE_CHECKING
60extern int jbd2_journal_enable_debug;
61
62#define jbd_debug(n, f, a...) \
63 do { \
64 if ((n) <= jbd2_journal_enable_debug) { \
65 printk (KERN_DEBUG "(%s, %d): %s: ", \
66 __FILE__, __LINE__, __FUNCTION__); \
67 printk (f, ## a); \
68 } \
69 } while (0)
70#else
71#define jbd_debug(f, a...) /**/
72#endif
73
74extern void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
75extern void * jbd2_slab_alloc(size_t size, gfp_t flags);
76extern void jbd2_slab_free(void *ptr, size_t size);
77
78#define jbd_kmalloc(size, flags) \
79 __jbd2_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
80#define jbd_rep_kmalloc(size, flags) \
81 __jbd2_kmalloc(__FUNCTION__, (size), (flags), 1)
82
83#define JBD2_MIN_JOURNAL_BLOCKS 1024
84
85#ifdef __KERNEL__
86
87/**
88 * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
89 *
90 * All filesystem modifications made by the process go
91 * through this handle. Recursive operations (such as quota operations)
92 * are gathered into a single update.
93 *
94 * The buffer credits field is used to account for journaled buffers
95 * being modified by the running process. To ensure that there is
96 * enough log space for all outstanding operations, we need to limit the
97 * number of outstanding buffers possible at any time. When the
98 * operation completes, any buffer credits not used are credited back to
99 * the transaction, so that at all times we know how many buffers the
100 * outstanding updates on a transaction might possibly touch.
101 *
102 * This is an opaque datatype.
103 **/
104typedef struct handle_s handle_t; /* Atomic operation type */
105
106
107/**
108 * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
109 *
110 * journal_t is linked to from the fs superblock structure.
111 *
112 * We use the journal_t to keep track of all outstanding transaction
113 * activity on the filesystem, and to manage the state of the log
114 * writing process.
115 *
116 * This is an opaque datatype.
117 **/
118typedef struct journal_s journal_t; /* Journal control structure */
119#endif
120
121/*
122 * Internal structures used by the logging mechanism:
123 */
124
125#define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
126
127/*
128 * On-disk structures
129 */
130
131/*
132 * Descriptor block types:
133 */
134
135#define JBD2_DESCRIPTOR_BLOCK 1
136#define JBD2_COMMIT_BLOCK 2
137#define JBD2_SUPERBLOCK_V1 3
138#define JBD2_SUPERBLOCK_V2 4
139#define JBD2_REVOKE_BLOCK 5
140
141/*
142 * Standard header for all descriptor blocks:
143 */
144typedef struct journal_header_s
145{
146 __be32 h_magic;
147 __be32 h_blocktype;
148 __be32 h_sequence;
149} journal_header_t;
150
151
152/*
153 * The block tag: used to describe a single buffer in the journal.
154 * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this
155 * raw struct shouldn't be used for pointer math or sizeof() - use
156 * journal_tag_bytes(journal) instead to compute this.
157 */
158typedef struct journal_block_tag_s
159{
160 __be32 t_blocknr; /* The on-disk block number */
161 __be32 t_flags; /* See below */
162 __be32 t_blocknr_high; /* most-significant high 32bits. */
163} journal_block_tag_t;
164
165#define JBD_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high))
166#define JBD_TAG_SIZE64 (sizeof(journal_block_tag_t))
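/*
 * Illustrative note (not part of this header): as the comment above says,
 * the on-disk tag size depends on the INCOMPAT_64BIT feature, so it is
 * computed at run time rather than with sizeof().  The helper the comment
 * refers to reduces to roughly:
 *
 *	return JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)
 *		? JBD_TAG_SIZE64 : JBD_TAG_SIZE32;
 *
 * (The feature-test macro and flag are defined further down in this file;
 * the exact shape of the real helper is an assumption here.)
 */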
167
168/*
169 * The revoke descriptor: used on disk to describe a series of blocks to
170 * be revoked from the log
171 */
172typedef struct jbd2_journal_revoke_header_s
173{
174 journal_header_t r_header;
175 __be32 r_count; /* Count of bytes used in the block */
176} jbd2_journal_revoke_header_t;
177
178
179/* Definitions for the journal tag flags word: */
180#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */
181#define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */
182#define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */
183#define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */
184
185
186/*
187 * The journal superblock. All fields are in big-endian byte order.
188 */
189typedef struct journal_superblock_s
190{
191/* 0x0000 */
192 journal_header_t s_header;
193
194/* 0x000C */
195 /* Static information describing the journal */
196 __be32 s_blocksize; /* journal device blocksize */
197 __be32 s_maxlen; /* total blocks in journal file */
198 __be32 s_first; /* first block of log information */
199
200/* 0x0018 */
201 /* Dynamic information describing the current state of the log */
202 __be32 s_sequence; /* first commit ID expected in log */
203 __be32 s_start; /* blocknr of start of log */
204
205/* 0x0020 */
206 /* Error value, as set by jbd2_journal_abort(). */
207 __be32 s_errno;
208
209/* 0x0024 */
210 /* Remaining fields are only valid in a version-2 superblock */
211 __be32 s_feature_compat; /* compatible feature set */
212 __be32 s_feature_incompat; /* incompatible feature set */
213 __be32 s_feature_ro_compat; /* readonly-compatible feature set */
214/* 0x0030 */
215 __u8 s_uuid[16]; /* 128-bit uuid for journal */
216
217/* 0x0040 */
218 __be32 s_nr_users; /* Nr of filesystems sharing log */
219
220 __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/
221
222/* 0x0048 */
223 __be32 s_max_transaction; /* Limit of journal blocks per trans.*/
224 __be32 s_max_trans_data; /* Limit of data blocks per trans. */
225
226/* 0x0050 */
227 __u32 s_padding[44];
228
229/* 0x0100 */
230 __u8 s_users[16*48]; /* ids of all fs'es sharing the log */
231/* 0x0400 */
232} journal_superblock_t;
233
234#define JBD2_HAS_COMPAT_FEATURE(j,mask) \
235 ((j)->j_format_version >= 2 && \
236 ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
237#define JBD2_HAS_RO_COMPAT_FEATURE(j,mask) \
238 ((j)->j_format_version >= 2 && \
239 ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
240#define JBD2_HAS_INCOMPAT_FEATURE(j,mask) \
241 ((j)->j_format_version >= 2 && \
242 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
243
244#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001
245#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002
246
247/* Features known to this kernel version: */
248#define JBD2_KNOWN_COMPAT_FEATURES 0
249#define JBD2_KNOWN_ROCOMPAT_FEATURES 0
250#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \
251 JBD2_FEATURE_INCOMPAT_64BIT)
252
253#ifdef __KERNEL__
254
255#include <linux/fs.h>
256#include <linux/sched.h>
257
258#define JBD_ASSERTIONS
259#ifdef JBD_ASSERTIONS
260#define J_ASSERT(assert) \
261do { \
262 if (!(assert)) { \
263 printk (KERN_EMERG \
264 "Assertion failure in %s() at %s:%d: \"%s\"\n", \
265 __FUNCTION__, __FILE__, __LINE__, # assert); \
266 BUG(); \
267 } \
268} while (0)
269
270#if defined(CONFIG_BUFFER_DEBUG)
271void buffer_assertion_failure(struct buffer_head *bh);
272#define J_ASSERT_BH(bh, expr) \
273 do { \
274 if (!(expr)) \
275 buffer_assertion_failure(bh); \
276 J_ASSERT(expr); \
277 } while (0)
278#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
279#else
280#define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
281#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
282#endif
283
284#else
285#define J_ASSERT(assert) do { } while (0)
286#endif /* JBD_ASSERTIONS */
287
288#if defined(JBD_PARANOID_IOFAIL)
289#define J_EXPECT(expr, why...) J_ASSERT(expr)
290#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr)
291#define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr)
292#else
293#define __journal_expect(expr, why...) \
294 ({ \
295 int val = (expr); \
296 if (!val) { \
297 printk(KERN_ERR \
298 "EXT3-fs unexpected failure: %s;\n",# expr); \
299 printk(KERN_ERR why "\n"); \
300 } \
301 val; \
302 })
303#define J_EXPECT(expr, why...) __journal_expect(expr, ## why)
304#define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why)
305#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why)
306#endif
307
308enum jbd_state_bits {
309 BH_JBD /* Has an attached ext3 journal_head */
310 = BH_PrivateStart,
311 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
312 BH_Freed, /* Has been freed (truncated) */
313 BH_Revoked, /* Has been revoked from the log */
314 BH_RevokeValid, /* Revoked flag is valid */
315 BH_JBDDirty, /* Is dirty but journaled */
316 BH_State, /* Pins most journal_head state */
317 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
318 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
319};
320
321BUFFER_FNS(JBD, jbd)
322BUFFER_FNS(JWrite, jwrite)
323BUFFER_FNS(JBDDirty, jbddirty)
324TAS_BUFFER_FNS(JBDDirty, jbddirty)
325BUFFER_FNS(Revoked, revoked)
326TAS_BUFFER_FNS(Revoked, revoked)
327BUFFER_FNS(RevokeValid, revokevalid)
328TAS_BUFFER_FNS(RevokeValid, revokevalid)
329BUFFER_FNS(Freed, freed)
330
331static inline struct buffer_head *jh2bh(struct journal_head *jh)
332{
333 return jh->b_bh;
334}
335
336static inline struct journal_head *bh2jh(struct buffer_head *bh)
337{
338 return bh->b_private;
339}
340
341static inline void jbd_lock_bh_state(struct buffer_head *bh)
342{
343 bit_spin_lock(BH_State, &bh->b_state);
344}
345
346static inline int jbd_trylock_bh_state(struct buffer_head *bh)
347{
348 return bit_spin_trylock(BH_State, &bh->b_state);
349}
350
351static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
352{
353 return bit_spin_is_locked(BH_State, &bh->b_state);
354}
355
356static inline void jbd_unlock_bh_state(struct buffer_head *bh)
357{
358 bit_spin_unlock(BH_State, &bh->b_state);
359}
360
361static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
362{
363 bit_spin_lock(BH_JournalHead, &bh->b_state);
364}
365
366static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
367{
368 bit_spin_unlock(BH_JournalHead, &bh->b_state);
369}
370
371struct jbd2_revoke_table_s;
372
373/**
374 * struct handle_s - The handle_s type is the concrete type associated with
375 * handle_t.
376 * @h_transaction: Which compound transaction is this update a part of?
377 * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
378 * @h_ref: Reference count on this handle
379 * @h_err: Field for caller's use to track errors through large fs operations
380 * @h_sync: flag for sync-on-close
381 * @h_jdata: flag to force data journaling
382 * @h_aborted: flag indicating fatal error on handle
383 **/
384
385/* Docbook can't yet cope with the bit fields, but we will leave the documentation
386 * in so it can be fixed later.
387 */
388
389struct handle_s
390{
391 /* Which compound transaction is this update a part of? */
392 transaction_t *h_transaction;
393
394 /* Number of remaining buffers we are allowed to dirty: */
395 int h_buffer_credits;
396
397 /* Reference count on this handle */
398 int h_ref;
399
400 /* Field for caller's use to track errors through large fs */
401 /* operations */
402 int h_err;
403
404 /* Flags [no locking] */
405 unsigned int h_sync: 1; /* sync-on-close */
406 unsigned int h_jdata: 1; /* force data journaling */
407 unsigned int h_aborted: 1; /* fatal error on handle */
408};
409
410
411/* The transaction_t type is the guts of the journaling mechanism. It
412 * tracks a compound transaction through its various states:
413 *
414 * RUNNING: accepting new updates
415 * LOCKED: Updates still running but we don't accept new ones
416 * RUNDOWN: Updates are tidying up but have finished requesting
417 * new buffers to modify (state not used for now)
418 * FLUSH: All updates complete, but we are still writing to disk
419 * COMMIT: All data on disk, writing commit record
420 * FINISHED: We still have to keep the transaction for checkpointing.
421 *
422 * The transaction keeps track of all of the buffers modified by a
423 * running transaction, and all of the buffers committed but not yet
424 * flushed to home for finished transactions.
425 */
426
427/*
428 * Lock ranking:
429 *
430 * j_list_lock
431 * ->jbd_lock_bh_journal_head() (This is "innermost")
432 *
433 * j_state_lock
434 * ->jbd_lock_bh_state()
435 *
436 * jbd_lock_bh_state()
437 * ->j_list_lock
438 *
439 * j_state_lock
440 * ->t_handle_lock
441 *
442 * j_state_lock
443 * ->j_list_lock (journal_unmap_buffer)
444 *
445 */
446
447struct transaction_s
448{
449 /* Pointer to the journal for this transaction. [no locking] */
450 journal_t *t_journal;
451
452 /* Sequence number for this transaction [no locking] */
453 tid_t t_tid;
454
455 /*
456 * Transaction's current state
457 * [no locking - only kjournald2 alters this]
458 * FIXME: needs barriers
459 * KLUDGE: [use j_state_lock]
460 */
461 enum {
462 T_RUNNING,
463 T_LOCKED,
464 T_RUNDOWN,
465 T_FLUSH,
466 T_COMMIT,
467 T_FINISHED
468 } t_state;
469
470 /*
471 * Where in the log does this transaction's commit start? [no locking]
472 */
473 unsigned long t_log_start;
474
475 /* Number of buffers on the t_buffers list [j_list_lock] */
476 int t_nr_buffers;
477
478 /*
479 * Doubly-linked circular list of all buffers reserved but not yet
480 * modified by this transaction [j_list_lock]
481 */
482 struct journal_head *t_reserved_list;
483
484 /*
485 * Doubly-linked circular list of all buffers under writeout during
486 * commit [j_list_lock]
487 */
488 struct journal_head *t_locked_list;
489
490 /*
491 * Doubly-linked circular list of all metadata buffers owned by this
492 * transaction [j_list_lock]
493 */
494 struct journal_head *t_buffers;
495
496 /*
497 * Doubly-linked circular list of all data buffers still to be
498 * flushed before this transaction can be committed [j_list_lock]
499 */
500 struct journal_head *t_sync_datalist;
501
502 /*
503 * Doubly-linked circular list of all forget buffers (superseded
504 * buffers which we can un-checkpoint once this transaction commits)
505 * [j_list_lock]
506 */
507 struct journal_head *t_forget;
508
509 /*
510 * Doubly-linked circular list of all buffers still to be flushed before
511 * this transaction can be checkpointed. [j_list_lock]
512 */
513 struct journal_head *t_checkpoint_list;
514
515 /*
516 * Doubly-linked circular list of all buffers submitted for IO while
517 * checkpointing. [j_list_lock]
518 */
519 struct journal_head *t_checkpoint_io_list;
520
521 /*
522 * Doubly-linked circular list of temporary buffers currently undergoing
523 * IO in the log [j_list_lock]
524 */
525 struct journal_head *t_iobuf_list;
526
527 /*
528 * Doubly-linked circular list of metadata buffers being shadowed by log
529 * IO. The IO buffers on the iobuf list and the shadow buffers on this
530 * list match each other one for one at all times. [j_list_lock]
531 */
532 struct journal_head *t_shadow_list;
533
534 /*
535 * Doubly-linked circular list of control buffers being written to the
536 * log. [j_list_lock]
537 */
538 struct journal_head *t_log_list;
539
540 /*
541 * Protects info related to handles
542 */
543 spinlock_t t_handle_lock;
544
545 /*
546 * Number of outstanding updates running on this transaction
547 * [t_handle_lock]
548 */
549 int t_updates;
550
551 /*
552 * Number of buffers reserved for use by all handles in this transaction
553 * but not yet modified. [t_handle_lock]
554 */
555 int t_outstanding_credits;
556
557 /*
558 * Forward and backward links for the circular list of all transactions
559 * awaiting checkpoint. [j_list_lock]
560 */
561 transaction_t *t_cpnext, *t_cpprev;
562
563 /*
564 * When will the transaction expire (become due for commit), in jiffies?
565 * [no locking]
566 */
567 unsigned long t_expires;
568
569 /*
570 * How many handles used this transaction? [t_handle_lock]
571 */
572 int t_handle_count;
573
574};
575
576/**
577 * struct journal_s - The journal_s type is the concrete type associated with
578 * journal_t.
579 * @j_flags: General journaling state flags
580 * @j_errno: Is there an outstanding uncleared error on the journal (from a
581 * prior abort)?
582 * @j_sb_buffer: First part of superblock buffer
583 * @j_superblock: Second part of superblock buffer
584 * @j_format_version: Version of the superblock format
585 * @j_state_lock: Protect the various scalars in the journal
586 * @j_barrier_count: Number of processes waiting to create a barrier lock
587 * @j_barrier: The barrier lock itself
588 * @j_running_transaction: The current running transaction..
589 * @j_committing_transaction: the transaction we are pushing to disk
590 * @j_checkpoint_transactions: a linked circular list of all transactions
591 * waiting for checkpointing
592 * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
593 * to start committing, or for a barrier lock to be released
594 * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
595 * @j_wait_done_commit: Wait queue for waiting for commit to complete
596 * @j_wait_checkpoint: Wait queue to trigger checkpointing
597 * @j_wait_commit: Wait queue to trigger commit
598 * @j_wait_updates: Wait queue to wait for updates to complete
599 * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
600 * @j_head: Journal head - identifies the first unused block in the journal
601 * @j_tail: Journal tail - identifies the oldest still-used block in the
602 * journal.
603 * @j_free: Journal free - how many free blocks are there in the journal?
604 * @j_first: The block number of the first usable block
605 * @j_last: The block number one beyond the last usable block
606 * @j_dev: Device where we store the journal
607 * @j_blocksize: blocksize for the location where we store the journal.
608 * @j_blk_offset: starting block offset into the device where we store the
609 * journal
610 * @j_fs_dev: Device which holds the client fs. For internal journal this will
611 * be equal to j_dev
612 * @j_maxlen: Total maximum capacity of the journal region on disk.
613 * @j_list_lock: Protects the buffer lists and internal buffer state.
614 * @j_inode: Optional inode where we store the journal. If present, all journal
615 * block numbers are mapped into this inode via bmap().
616 * @j_tail_sequence: Sequence number of the oldest transaction in the log
617 * @j_transaction_sequence: Sequence number of the next transaction to grant
618 * @j_commit_sequence: Sequence number of the most recently committed
619 * transaction
620 * @j_commit_request: Sequence number of the most recent transaction wanting
621 * commit
622 * @j_uuid: Uuid of client object.
623 * @j_task: Pointer to the current commit thread for this journal
624 * @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a
625 * single compound commit transaction
626 * @j_commit_interval: What is the maximum transaction lifetime before we begin
627 * a commit?
628 * @j_commit_timer: The timer used to wakeup the commit thread
629 * @j_revoke_lock: Protect the revoke table
630 * @j_revoke: The revoke table - maintains the list of revoked blocks in the
631 * current transaction.
632 * @j_revoke_table: alternate revoke tables for j_revoke
633 * @j_wbuf: array of buffer_heads for jbd2_journal_commit_transaction
634 * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
635 * number that will fit in j_blocksize
636 * @j_last_sync_writer: most recent pid which did a synchronous write
637 * @j_private: An opaque pointer to fs-private information.
638 */
639
640struct journal_s
641{
642 /* General journaling state flags [j_state_lock] */
643 unsigned long j_flags;
644
645 /*
646 * Is there an outstanding uncleared error on the journal (from a prior
647 * abort)? [j_state_lock]
648 */
649 int j_errno;
650
651 /* The superblock buffer */
652 struct buffer_head *j_sb_buffer;
653 journal_superblock_t *j_superblock;
654
655 /* Version of the superblock format */
656 int j_format_version;
657
658 /*
659 * Protect the various scalars in the journal
660 */
661 spinlock_t j_state_lock;
662
663 /*
664 * Number of processes waiting to create a barrier lock [j_state_lock]
665 */
666 int j_barrier_count;
667
668 /* The barrier lock itself */
669 struct mutex j_barrier;
670
671 /*
672 * Transactions: The current running transaction...
673 * [j_state_lock] [caller holding open handle]
674 */
675 transaction_t *j_running_transaction;
676
677 /*
678 * the transaction we are pushing to disk
679 * [j_state_lock] [caller holding open handle]
680 */
681 transaction_t *j_committing_transaction;
682
683 /*
684 * ... and a linked circular list of all transactions waiting for
685 * checkpointing. [j_list_lock]
686 */
687 transaction_t *j_checkpoint_transactions;
688
689 /*
690 * Wait queue for waiting for a locked transaction to start committing,
691 * or for a barrier lock to be released
692 */
693 wait_queue_head_t j_wait_transaction_locked;
694
695 /* Wait queue for waiting for checkpointing to complete */
696 wait_queue_head_t j_wait_logspace;
697
698 /* Wait queue for waiting for commit to complete */
699 wait_queue_head_t j_wait_done_commit;
700
701 /* Wait queue to trigger checkpointing */
702 wait_queue_head_t j_wait_checkpoint;
703
704 /* Wait queue to trigger commit */
705 wait_queue_head_t j_wait_commit;
706
707 /* Wait queue to wait for updates to complete */
708 wait_queue_head_t j_wait_updates;
709
710 /* Semaphore for locking against concurrent checkpoints */
711 struct mutex j_checkpoint_mutex;
712
713 /*
714 * Journal head: identifies the first unused block in the journal.
715 * [j_state_lock]
716 */
717 unsigned long j_head;
718
719 /*
720 * Journal tail: identifies the oldest still-used block in the journal.
721 * [j_state_lock]
722 */
723 unsigned long j_tail;
724
725 /*
726 * Journal free: how many free blocks are there in the journal?
727 * [j_state_lock]
728 */
729 unsigned long j_free;
730
731 /*
732 * Journal start and end: the block numbers of the first usable block
733 * and one beyond the last usable block in the journal. [j_state_lock]
734 */
735 unsigned long j_first;
736 unsigned long j_last;
737
738 /*
739 * Device, blocksize and starting block offset for the location where we
740 * store the journal.
741 */
742 struct block_device *j_dev;
743 int j_blocksize;
744 unsigned long long j_blk_offset;
745
746 /*
747 * Device which holds the client fs. For internal journal this will be
748 * equal to j_dev.
749 */
750 struct block_device *j_fs_dev;
751
752 /* Total maximum capacity of the journal region on disk. */
753 unsigned int j_maxlen;
754
755 /*
756 * Protects the buffer lists and internal buffer state.
757 */
758 spinlock_t j_list_lock;
759
760 /* Optional inode where we store the journal. If present, all */
761 /* journal block numbers are mapped into this inode via */
762 /* bmap(). */
763 struct inode *j_inode;
764
765 /*
766 * Sequence number of the oldest transaction in the log [j_state_lock]
767 */
768 tid_t j_tail_sequence;
769
770 /*
771 * Sequence number of the next transaction to grant [j_state_lock]
772 */
773 tid_t j_transaction_sequence;
774
775 /*
776 * Sequence number of the most recently committed transaction
777 * [j_state_lock].
778 */
779 tid_t j_commit_sequence;
780
781 /*
782 * Sequence number of the most recent transaction wanting commit
783 * [j_state_lock]
784 */
785 tid_t j_commit_request;
786
787 /*
788 * Journal uuid: identifies the object (filesystem, LVM volume etc)
789 * backed by this journal. This will eventually be replaced by an array
790 * of uuids, allowing us to index multiple devices within a single
791 * journal and to perform atomic updates across them.
792 */
793 __u8 j_uuid[16];
794
795 /* Pointer to the current commit thread for this journal */
796 struct task_struct *j_task;
797
798 /*
799 * Maximum number of metadata buffers to allow in a single compound
800 * commit transaction
801 */
802 int j_max_transaction_buffers;
803
804 /*
805 * What is the maximum transaction lifetime before we begin a commit?
806 */
807 unsigned long j_commit_interval;
808
809 /* The timer used to wakeup the commit thread: */
810 struct timer_list j_commit_timer;
811
812 /*
813 * The revoke table: maintains the list of revoked blocks in the
814 * current transaction. [j_revoke_lock]
815 */
816 spinlock_t j_revoke_lock;
817 struct jbd2_revoke_table_s *j_revoke;
818 struct jbd2_revoke_table_s *j_revoke_table[2];
819
820 /*
821 * array of bhs for jbd2_journal_commit_transaction
822 */
823 struct buffer_head **j_wbuf;
824 int j_wbufsize;
825
826 pid_t j_last_sync_writer;
827
828 /*
829 * An opaque pointer to fs-private information. ext3 puts its
830 * superblock pointer here
831 */
832 void *j_private;
833};
834
835/*
836 * Journal flag definitions
837 */
838#define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */
839#define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */
840#define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */
841#define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */
842#define JBD2_LOADED 0x010 /* The journal superblock has been loaded */
843#define JBD2_BARRIER 0x020 /* Use IDE barriers */
844
845/*
846 * Function declarations for the journaling transaction and buffer
847 * management
848 */
849
850/* Filing buffers */
851extern void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
852extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
853extern void __jbd2_journal_unfile_buffer(struct journal_head *);
854extern void __jbd2_journal_refile_buffer(struct journal_head *);
855extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
856extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
857extern void __journal_free_buffer(struct journal_head *bh);
858extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
859extern void __journal_clean_data_list(transaction_t *transaction);
860
861/* Log buffer allocation */
862extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *);
863int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
864
865/* Commit management */
866extern void jbd2_journal_commit_transaction(journal_t *);
867
868/* Checkpoint list management */
869int __jbd2_journal_clean_checkpoint_list(journal_t *journal);
870int __jbd2_journal_remove_checkpoint(struct journal_head *);
871void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
872
873/* Buffer IO */
874extern int
875jbd2_journal_write_metadata_buffer(transaction_t *transaction,
876 struct journal_head *jh_in,
877 struct journal_head **jh_out,
878 unsigned long long blocknr);
879
880/* Transaction locking */
881extern void __wait_on_journal (journal_t *);
882
883/*
884 * Journal locking.
885 *
886 * We need to lock the journal during transaction state changes so that nobody
887 * ever tries to take a handle on the running transaction while we are in the
888 * middle of moving it to the commit phase. j_state_lock does this.
889 *
890 * Note that the locking is completely interrupt unsafe. We never touch
891 * journal structures from interrupts.
892 */
893
894static inline handle_t *journal_current_handle(void)
895{
896 return current->journal_info;
897}
898
899/* The journaling code user interface:
900 *
901 * Create and destroy handles
902 * Register buffer modifications against the current transaction.
903 */
904
905extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
906extern int jbd2_journal_restart (handle_t *, int nblocks);
907extern int jbd2_journal_extend (handle_t *, int nblocks);
908extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
909extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
910extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
911extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
912extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
913extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
914extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
915extern void journal_sync_buffer (struct buffer_head *);
916extern void jbd2_journal_invalidatepage(journal_t *,
917 struct page *, unsigned long);
918extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
919extern int jbd2_journal_stop(handle_t *);
920extern int jbd2_journal_flush (journal_t *);
921extern void jbd2_journal_lock_updates (journal_t *);
922extern void jbd2_journal_unlock_updates (journal_t *);
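As a rough illustration of how a client filesystem drives the handle API declared above, a metadata update is normally bracketed like this. This is a minimal sketch only, assuming an already-initialised journal_t and the buffer_head being modified; the function name is illustrative and the code is not taken from ext4:

#include <linux/jbd2.h>
#include <linux/err.h>

static int example_update_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve credit for one metadata block in the running transaction. */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Tell jbd2 we are about to modify bh so it can copy/escape it. */
	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify the buffer contents here ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}

	/* Drop the handle; the commit thread writes the transaction out later. */
	jbd2_journal_stop(handle);
	return err;
}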
923
924extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
925 struct block_device *fs_dev,
926 unsigned long long start, int len, int bsize);
927extern journal_t * jbd2_journal_init_inode (struct inode *);
928extern int jbd2_journal_update_format (journal_t *);
929extern int jbd2_journal_check_used_features
930 (journal_t *, unsigned long, unsigned long, unsigned long);
931extern int jbd2_journal_check_available_features
932 (journal_t *, unsigned long, unsigned long, unsigned long);
933extern int jbd2_journal_set_features
934 (journal_t *, unsigned long, unsigned long, unsigned long);
935extern int jbd2_journal_create (journal_t *);
936extern int jbd2_journal_load (journal_t *journal);
937extern void jbd2_journal_destroy (journal_t *);
938extern int jbd2_journal_recover (journal_t *journal);
939extern int jbd2_journal_wipe (journal_t *, int);
940extern int jbd2_journal_skip_recovery (journal_t *);
941extern void jbd2_journal_update_superblock (journal_t *, int);
942extern void __jbd2_journal_abort_hard (journal_t *);
943extern void jbd2_journal_abort (journal_t *, int);
944extern int jbd2_journal_errno (journal_t *);
945extern void jbd2_journal_ack_err (journal_t *);
946extern int jbd2_journal_clear_err (journal_t *);
947extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
948extern int jbd2_journal_force_commit(journal_t *);
949
950/*
951 * journal_head management
952 */
953struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);
954struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh);
955void jbd2_journal_remove_journal_head(struct buffer_head *bh);
956void jbd2_journal_put_journal_head(struct journal_head *jh);
957
958/*
959 * handle management
960 */
961extern kmem_cache_t *jbd2_handle_cache;
962
963static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
964{
965 return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
966}
967
968static inline void jbd_free_handle(handle_t *handle)
969{
970 kmem_cache_free(jbd2_handle_cache, handle);
971}
972
973/* Primary revoke support */
974#define JOURNAL_REVOKE_DEFAULT_HASH 256
975extern int jbd2_journal_init_revoke(journal_t *, int);
976extern void jbd2_journal_destroy_revoke_caches(void);
977extern int jbd2_journal_init_revoke_caches(void);
978
979extern void jbd2_journal_destroy_revoke(journal_t *);
980extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
981extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
982extern void jbd2_journal_write_revoke_records(journal_t *, transaction_t *);
983
984/* Recovery revoke support */
985extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
986extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t);
987extern void jbd2_journal_clear_revoke(journal_t *);
988extern void jbd2_journal_switch_revoke_table(journal_t *journal);
989
990/*
991 * The log thread user interface:
992 *
993 * Request space in the current transaction, and force transaction commit
994 * transitions on demand.
995 */
996
997int __jbd2_log_space_left(journal_t *); /* Called with journal locked */
998int jbd2_log_start_commit(journal_t *journal, tid_t tid);
999int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
1000int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
1001int jbd2_journal_force_commit_nested(journal_t *journal);
1002int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1003int jbd2_log_do_checkpoint(journal_t *journal);
1004
1005void __jbd2_log_wait_for_space(journal_t *journal);
1006extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
1007extern int jbd2_cleanup_journal_tail(journal_t *);
1008
1009/* Debugging code only: */
1010
1011#define jbd_ENOSYS() \
1012do { \
1013 printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
1014 current->state = TASK_UNINTERRUPTIBLE; \
1015 schedule(); \
1016} while (1)
1017
1018/*
1019 * is_journal_aborted
1020 *
1021 * Simple test wrapper function to test the JBD2_ABORT state flag. This
1022 * bit, when set, indicates that we have had a fatal error somewhere,
1023 * either inside the journaling layer or indicated to us by the client
1024 * (eg. ext3), and that we should not commit any further
1025 * transactions.
1026 */
1027
1028static inline int is_journal_aborted(journal_t *journal)
1029{
1030 return journal->j_flags & JBD2_ABORT;
1031}
1032
1033static inline int is_handle_aborted(handle_t *handle)
1034{
1035 if (handle->h_aborted)
1036 return 1;
1037 return is_journal_aborted(handle->h_transaction->t_journal);
1038}
1039
1040static inline void jbd2_journal_abort_handle(handle_t *handle)
1041{
1042 handle->h_aborted = 1;
1043}
1044
1045#endif /* __KERNEL__ */
1046
1047/* Comparison functions for transaction IDs: perform comparisons using
1048 * modulo arithmetic so that they work over sequence number wraps. */
1049
1050static inline int tid_gt(tid_t x, tid_t y)
1051{
1052 int difference = (x - y);
1053 return (difference > 0);
1054}
1055
1056static inline int tid_geq(tid_t x, tid_t y)
1057{
1058 int difference = (x - y);
1059 return (difference >= 0);
1060}
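The signed-difference trick keeps these comparisons correct across a wrap of the sequence counter; for example (illustrative values only, assuming a 32-bit tid_t):

static int example_tid_wrap_ok(void)
{
	tid_t older = 0xfffffff0;	/* issued just before the counter wraps */
	tid_t newer = 0x00000005;	/* issued just after the wrap */

	/* (newer - older) == 21, positive as a signed int, so the later
	 * transaction still compares as greater despite the wrap. */
	return tid_gt(newer, older) && tid_geq(newer, older);	/* returns 1 */
}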
1061
1062extern int jbd2_journal_blocks_per_page(struct inode *inode);
1063extern size_t journal_tag_bytes(journal_t *journal);
1064
1065/*
1066 * Return the minimum number of blocks which must be free in the journal
1067 * before a new transaction may be started. Must be called under j_state_lock.
1068 */
1069static inline int jbd_space_needed(journal_t *journal)
1070{
1071 int nblocks = journal->j_max_transaction_buffers;
1072 if (journal->j_committing_transaction)
1073 nblocks += journal->j_committing_transaction->
1074 t_outstanding_credits;
1075 return nblocks;
1076}
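Together with __jbd2_log_space_left() declared earlier, this gives the basic admission check for new transactions. A hedged sketch only, with the caller holding j_state_lock; the real jbd2 start-handle path does more than this:

static int example_enough_log_space(journal_t *journal)
{
	/* A new transaction may start only if the log still has room for its
	 * own worst case plus the transaction currently being committed. */
	return __jbd2_log_space_left(journal) >= jbd_space_needed(journal);
}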
1077
1078/*
1079 * Definitions which augment the buffer_head layer
1080 */
1081
1082/* journaling buffer types */
1083#define BJ_None 0 /* Not journaled */
1084#define BJ_SyncData 1 /* Normal data: flush before commit */
1085#define BJ_Metadata 2 /* Normal journaled metadata */
1086#define BJ_Forget 3 /* Buffer superseded by this transaction */
1087#define BJ_IO 4 /* Buffer is for temporary IO use */
1088#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
1089#define BJ_LogCtl 6 /* Buffer contains log descriptors */
1090#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
1091#define BJ_Locked 8 /* Locked for I/O during commit */
1092#define BJ_Types 9
1093
1094extern int jbd_blocks_per_page(struct inode *inode);
1095
1096#ifdef __KERNEL__
1097
1098#define buffer_trace_init(bh) do {} while (0)
1099#define print_buffer_fields(bh) do {} while (0)
1100#define print_buffer_trace(bh) do {} while (0)
1101#define BUFFER_TRACE(bh, info) do {} while (0)
1102#define BUFFER_TRACE2(bh, bh2, info) do {} while (0)
1103#define JBUFFER_TRACE(jh, info) do {} while (0)
1104
1105#endif /* __KERNEL__ */
1106
1107#endif /* _LINUX_JBD_H */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 22036dd2ba36..156c40fc664e 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -8,6 +8,7 @@
8#define EFS_SUPER_MAGIC 0x414A53 8#define EFS_SUPER_MAGIC 0x414A53
9#define EXT2_SUPER_MAGIC 0xEF53 9#define EXT2_SUPER_MAGIC 0xEF53
10#define EXT3_SUPER_MAGIC 0xEF53 10#define EXT3_SUPER_MAGIC 0xEF53
11#define EXT4_SUPER_MAGIC 0xEF53
11#define HPFS_SUPER_MAGIC 0xf995e849 12#define HPFS_SUPER_MAGIC 0xf995e849
12#define ISOFS_SUPER_MAGIC 0x9660 13#define ISOFS_SUPER_MAGIC 0x9660
13#define JFFS2_SUPER_MAGIC 0x72b6 14#define JFFS2_SUPER_MAGIC 0x72b6
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 26146623be2f..5a6068ff5556 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1103,12 +1103,7 @@ static inline void vm_stat_account(struct mm_struct *mm,
1103 1103
1104#ifndef CONFIG_DEBUG_PAGEALLOC 1104#ifndef CONFIG_DEBUG_PAGEALLOC
1105static inline void 1105static inline void
1106kernel_map_pages(struct page *page, int numpages, int enable) 1106kernel_map_pages(struct page *page, int numpages, int enable) {}
1107{
1108 if (!PageHighMem(page) && !enable)
1109 debug_check_no_locks_freed(page_address(page),
1110 numpages * PAGE_SIZE);
1111}
1112#endif 1107#endif
1113 1108
1114extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); 1109extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
diff --git a/include/linux/module.h b/include/linux/module.h
index 4b2d8091a410..d1d00ce8f4ed 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -317,9 +317,6 @@ struct module
317 /* Am I unsafe to unload? */ 317 /* Am I unsafe to unload? */
318 int unsafe; 318 int unsafe;
319 319
320 /* Am I GPL-compatible */
321 int license_gplok;
322
323 unsigned int taints; /* same bits as kernel:tainted */ 320 unsigned int taints; /* same bits as kernel:tainted */
324 321
325#ifdef CONFIG_MODULE_UNLOAD 322#ifdef CONFIG_MODULE_UNLOAD
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index e712e7d47cc2..d6b6dc09ad97 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -15,6 +15,8 @@
15#ifndef LINUX_NBD_H 15#ifndef LINUX_NBD_H
16#define LINUX_NBD_H 16#define LINUX_NBD_H
17 17
18#include <linux/types.h>
19
18#define NBD_SET_SOCK _IO( 0xab, 0 ) 20#define NBD_SET_SOCK _IO( 0xab, 0 )
19#define NBD_SET_BLKSIZE _IO( 0xab, 1 ) 21#define NBD_SET_BLKSIZE _IO( 0xab, 1 )
20#define NBD_SET_SIZE _IO( 0xab, 2 ) 22#define NBD_SET_SIZE _IO( 0xab, 2 )
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 5dce5c21822c..b1063e9cdb1b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -8,8 +8,8 @@
8 * See detailed comments in the file linux/bitmap.h describing the 8 * See detailed comments in the file linux/bitmap.h describing the
9 * data type on which these nodemasks are based. 9 * data type on which these nodemasks are based.
10 * 10 *
11 * For details of nodemask_scnprintf() and nodemask_parse(), 11 * For details of nodemask_scnprintf() and nodemask_parse_user(),
12 * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. 12 * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
13 * For details of nodelist_scnprintf() and nodelist_parse(), see 13 * For details of nodelist_scnprintf() and nodelist_parse(), see
14 * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c. 14 * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
15 * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c. 15 * For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
@@ -51,7 +51,7 @@
51 * unsigned long *nodes_addr(mask) Array of unsigned long's in mask 51 * unsigned long *nodes_addr(mask) Array of unsigned long's in mask
52 * 52 *
53 * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing 53 * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
54 * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask 54 * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask
55 * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing 55 * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
56 * int nodelist_parse(buf, map) Parse ascii string as nodelist 56 * int nodelist_parse(buf, map) Parse ascii string as nodelist
57 * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) 57 * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit)
@@ -288,12 +288,12 @@ static inline int __nodemask_scnprintf(char *buf, int len,
288 return bitmap_scnprintf(buf, len, srcp->bits, nbits); 288 return bitmap_scnprintf(buf, len, srcp->bits, nbits);
289} 289}
290 290
291#define nodemask_parse(ubuf, ulen, dst) \ 291#define nodemask_parse_user(ubuf, ulen, dst) \
292 __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES) 292 __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
293static inline int __nodemask_parse(const char __user *buf, int len, 293static inline int __nodemask_parse_user(const char __user *buf, int len,
294 nodemask_t *dstp, int nbits) 294 nodemask_t *dstp, int nbits)
295{ 295{
296 return bitmap_parse(buf, len, dstp->bits, nbits); 296 return bitmap_parse_user(buf, len, dstp->bits, nbits);
297} 297}
298 298
299#define nodelist_scnprintf(buf, len, src) \ 299#define nodelist_scnprintf(buf, len, src) \
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0ace3fd7eb9..1912c6cbef55 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -431,6 +431,10 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
431 struct epoll_event __user *event); 431 struct epoll_event __user *event);
432asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 432asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
433 int maxevents, int timeout); 433 int maxevents, int timeout);
434asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
435 int maxevents, int timeout,
436 const sigset_t __user *sigmask,
437 size_t sigsetsize);
434asmlinkage long sys_gethostname(char __user *name, int len); 438asmlinkage long sys_gethostname(char __user *name, int len);
435asmlinkage long sys_sethostname(char __user *name, int len); 439asmlinkage long sys_sethostname(char __user *name, int len);
436asmlinkage long sys_setdomainname(char __user *name, int len); 440asmlinkage long sys_setdomainname(char __user *name, int len);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 607c7809ad01..9a352667007c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -57,7 +57,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
58 return -EIO; 58 return -EIO;
59 59
60 err = cpumask_parse(buffer, count, new_value); 60 err = cpumask_parse_user(buffer, count, new_value);
61 if (err) 61 if (err)
62 return err; 62 return err;
63 63
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4c0553461000..805a322a5655 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1114,8 +1114,6 @@ static int count_matching_names(struct lock_class *new_class)
1114 return count + 1; 1114 return count + 1;
1115} 1115}
1116 1116
1117extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
1118
1119/* 1117/*
1120 * Register a lock's class in the hash-table, if the class is not present 1118 * Register a lock's class in the hash-table, if the class is not present
1121 * yet. Otherwise we look it up. We cache the result in the lock object 1119 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1153,8 +1151,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1153 * (or spin_lock_init()) call - which acts as the key. For static 1151 * (or spin_lock_init()) call - which acts as the key. For static
1154 * locks we use the lock object itself as the key. 1152 * locks we use the lock object itself as the key.
1155 */ 1153 */
1156 if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) 1154 BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
1157 __error_too_big_MAX_LOCKDEP_SUBCLASSES();
1158 1155
1159 key = lock->key->subkeys + subclass; 1156 key = lock->key->subkeys + subclass;
1160 1157
diff --git a/kernel/module.c b/kernel/module.c
index 7f60e782de1e..67009bd56c52 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod)
87 return try_module_get(mod); 87 return try_module_get(mod);
88} 88}
89 89
90static inline void add_taint_module(struct module *mod, unsigned flag)
91{
92 add_taint(flag);
93 mod->taints |= flag;
94}
95
90/* A thread that wants to hold a reference to a module only while it 96/* A thread that wants to hold a reference to a module only while it
91 * is running can call ths to safely exit. 97 * is running can call ths to safely exit.
92 * nfsd and lockd use this. 98 * nfsd and lockd use this.
@@ -847,12 +853,10 @@ static int check_version(Elf_Shdr *sechdrs,
847 return 0; 853 return 0;
848 } 854 }
849 /* Not in module's version table. OK, but that taints the kernel. */ 855 /* Not in module's version table. OK, but that taints the kernel. */
850 if (!(tainted & TAINT_FORCED_MODULE)) { 856 if (!(tainted & TAINT_FORCED_MODULE))
851 printk("%s: no version for \"%s\" found: kernel tainted.\n", 857 printk("%s: no version for \"%s\" found: kernel tainted.\n",
852 mod->name, symname); 858 mod->name, symname);
853 add_taint(TAINT_FORCED_MODULE); 859 add_taint_module(mod, TAINT_FORCED_MODULE);
854 mod->taints |= TAINT_FORCED_MODULE;
855 }
856 return 1; 860 return 1;
857} 861}
858 862
@@ -910,7 +914,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
910 unsigned long ret; 914 unsigned long ret;
911 const unsigned long *crc; 915 const unsigned long *crc;
912 916
913 ret = __find_symbol(name, &owner, &crc, mod->license_gplok); 917 ret = __find_symbol(name, &owner, &crc,
918 !(mod->taints & TAINT_PROPRIETARY_MODULE));
914 if (ret) { 919 if (ret) {
915 /* use_module can fail due to OOM, or module unloading */ 920 /* use_module can fail due to OOM, or module unloading */
916 if (!check_version(sechdrs, versindex, name, mod, crc) || 921 if (!check_version(sechdrs, versindex, name, mod, crc) ||
@@ -1335,12 +1340,11 @@ static void set_license(struct module *mod, const char *license)
1335 if (!license) 1340 if (!license)
1336 license = "unspecified"; 1341 license = "unspecified";
1337 1342
1338 mod->license_gplok = license_is_gpl_compatible(license); 1343 if (!license_is_gpl_compatible(license)) {
1339 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1344 if (!(tainted & TAINT_PROPRIETARY_MODULE))
1340 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1345 printk(KERN_WARNING "%s: module license '%s' taints"
1341 mod->name, license); 1346 "kernel.\n", mod->name, license);
1342 add_taint(TAINT_PROPRIETARY_MODULE); 1347 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1343 mod->taints |= TAINT_PROPRIETARY_MODULE;
1344 } 1348 }
1345} 1349}
1346 1350
@@ -1619,8 +1623,7 @@ static struct module *load_module(void __user *umod,
1619 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1623 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1620 /* This is allowed: modprobe --force will invalidate it. */ 1624 /* This is allowed: modprobe --force will invalidate it. */
1621 if (!modmagic) { 1625 if (!modmagic) {
1622 add_taint(TAINT_FORCED_MODULE); 1626 add_taint_module(mod, TAINT_FORCED_MODULE);
1623 mod->taints |= TAINT_FORCED_MODULE;
1624 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1627 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1625 mod->name); 1628 mod->name);
1626 } else if (!same_magic(modmagic, vermagic)) { 1629 } else if (!same_magic(modmagic, vermagic)) {
@@ -1714,14 +1717,10 @@ static struct module *load_module(void __user *umod,
1714 /* Set up license info based on the info section */ 1717 /* Set up license info based on the info section */
1715 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1718 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1716 1719
1717 if (strcmp(mod->name, "ndiswrapper") == 0) { 1720 if (strcmp(mod->name, "ndiswrapper") == 0)
1718 add_taint(TAINT_PROPRIETARY_MODULE); 1721 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1719 mod->taints |= TAINT_PROPRIETARY_MODULE; 1722 if (strcmp(mod->name, "driverloader") == 0)
1720 } 1723 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1721 if (strcmp(mod->name, "driverloader") == 0) {
1722 add_taint(TAINT_PROPRIETARY_MODULE);
1723 mod->taints |= TAINT_PROPRIETARY_MODULE;
1724 }
1725 1724
1726 /* Set up MODINFO_ATTR fields */ 1725 /* Set up MODINFO_ATTR fields */
1727 setup_modinfo(mod, sechdrs, infoindex); 1726 setup_modinfo(mod, sechdrs, infoindex);
@@ -1766,8 +1765,7 @@ static struct module *load_module(void __user *umod,
1766 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 1765 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1767 printk(KERN_WARNING "%s: No versions for exported symbols." 1766 printk(KERN_WARNING "%s: No versions for exported symbols."
1768 " Tainting kernel.\n", mod->name); 1767 " Tainting kernel.\n", mod->name);
1769 add_taint(TAINT_FORCED_MODULE); 1768 add_taint_module(mod, TAINT_FORCED_MODULE);
1770 mod->taints |= TAINT_FORCED_MODULE;
1771 } 1769 }
1772#endif 1770#endif
1773 1771
@@ -2132,9 +2130,33 @@ static void m_stop(struct seq_file *m, void *p)
2132 mutex_unlock(&module_mutex); 2130 mutex_unlock(&module_mutex);
2133} 2131}
2134 2132
2133static char *taint_flags(unsigned int taints, char *buf)
2134{
2135 int bx = 0;
2136
2137 if (taints) {
2138 buf[bx++] = '(';
2139 if (taints & TAINT_PROPRIETARY_MODULE)
2140 buf[bx++] = 'P';
2141 if (taints & TAINT_FORCED_MODULE)
2142 buf[bx++] = 'F';
2143 /*
2144 * TAINT_FORCED_RMMOD: could be added.
2145 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2146 * apply to modules.
2147 */
2148 buf[bx++] = ')';
2149 }
2150 buf[bx] = '\0';
2151
2152 return buf;
2153}
2154
2135static int m_show(struct seq_file *m, void *p) 2155static int m_show(struct seq_file *m, void *p)
2136{ 2156{
2137 struct module *mod = list_entry(p, struct module, list); 2157 struct module *mod = list_entry(p, struct module, list);
2158 char buf[8];
2159
2138 seq_printf(m, "%s %lu", 2160 seq_printf(m, "%s %lu",
2139 mod->name, mod->init_size + mod->core_size); 2161 mod->name, mod->init_size + mod->core_size);
2140 print_unload_info(m, mod); 2162 print_unload_info(m, mod);
@@ -2147,6 +2169,10 @@ static int m_show(struct seq_file *m, void *p)
2147 /* Used by oprofile and other similar tools. */ 2169 /* Used by oprofile and other similar tools. */
2148 seq_printf(m, " 0x%p", mod->module_core); 2170 seq_printf(m, " 0x%p", mod->module_core);
2149 2171
2172 /* Taints info */
2173 if (mod->taints)
2174 seq_printf(m, " %s", taint_flags(mod->taints, buf));
2175
2150 seq_printf(m, "\n"); 2176 seq_printf(m, "\n");
2151 return 0; 2177 return 0;
2152} 2178}
@@ -2235,28 +2261,6 @@ struct module *module_text_address(unsigned long addr)
2235 return mod; 2261 return mod;
2236} 2262}
2237 2263
2238static char *taint_flags(unsigned int taints, char *buf)
2239{
2240 *buf = '\0';
2241 if (taints) {
2242 int bx;
2243
2244 buf[0] = '(';
2245 bx = 1;
2246 if (taints & TAINT_PROPRIETARY_MODULE)
2247 buf[bx++] = 'P';
2248 if (taints & TAINT_FORCED_MODULE)
2249 buf[bx++] = 'F';
2250 /*
2251 * TAINT_FORCED_RMMOD: could be added.
2252 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2253 * apply to modules.
2254 */
2255 buf[bx] = ')';
2256 }
2257 return buf;
2258}
2259
2260/* Don't grab lock, we're oopsing. */ 2264/* Don't grab lock, we're oopsing. */
2261void print_modules(void) 2265void print_modules(void)
2262{ 2266{
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d72234942798..d3a158a60312 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h>
21#include <linux/cpu.h> 22#include <linux/cpu.h>
22 23
23#include "power.h" 24#include "power.h"
@@ -119,8 +120,10 @@ int pm_suspend_disk(void)
119 if (error) 120 if (error)
120 return error; 121 return error;
121 122
123 suspend_console();
122 error = device_suspend(PMSG_FREEZE); 124 error = device_suspend(PMSG_FREEZE);
123 if (error) { 125 if (error) {
126 resume_console();
124 printk("Some devices failed to suspend\n"); 127 printk("Some devices failed to suspend\n");
125 unprepare_processes(); 128 unprepare_processes();
126 return error; 129 return error;
@@ -133,6 +136,7 @@ int pm_suspend_disk(void)
133 136
134 if (in_suspend) { 137 if (in_suspend) {
135 device_resume(); 138 device_resume();
139 resume_console();
136 pr_debug("PM: writing image.\n"); 140 pr_debug("PM: writing image.\n");
137 error = swsusp_write(); 141 error = swsusp_write();
138 if (!error) 142 if (!error)
@@ -148,6 +152,7 @@ int pm_suspend_disk(void)
148 swsusp_free(); 152 swsusp_free();
149 Done: 153 Done:
150 device_resume(); 154 device_resume();
155 resume_console();
151 unprepare_processes(); 156 unprepare_processes();
152 return error; 157 return error;
153} 158}
@@ -212,7 +217,9 @@ static int software_resume(void)
212 217
213 pr_debug("PM: Preparing devices for restore.\n"); 218 pr_debug("PM: Preparing devices for restore.\n");
214 219
220 suspend_console();
215 if ((error = device_suspend(PMSG_PRETHAW))) { 221 if ((error = device_suspend(PMSG_PRETHAW))) {
222 resume_console();
216 printk("Some devices failed to suspend\n"); 223 printk("Some devices failed to suspend\n");
217 swsusp_free(); 224 swsusp_free();
218 goto Thaw; 225 goto Thaw;
@@ -224,6 +231,7 @@ static int software_resume(void)
224 swsusp_resume(); 231 swsusp_resume();
225 pr_debug("PM: Restore failed, recovering.n"); 232 pr_debug("PM: Restore failed, recovering.n");
226 device_resume(); 233 device_resume();
234 resume_console();
227 Thaw: 235 Thaw:
228 unprepare_processes(); 236 unprepare_processes();
229 Done: 237 Done:
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 93b5dd283dea..d991d3b0e5a4 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/console.h>
22#include <linux/cpu.h> 23#include <linux/cpu.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
@@ -173,12 +174,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
173 /* Free memory before shutting down devices. */ 174 /* Free memory before shutting down devices. */
174 error = swsusp_shrink_memory(); 175 error = swsusp_shrink_memory();
175 if (!error) { 176 if (!error) {
177 suspend_console();
176 error = device_suspend(PMSG_FREEZE); 178 error = device_suspend(PMSG_FREEZE);
177 if (!error) { 179 if (!error) {
178 in_suspend = 1; 180 in_suspend = 1;
179 error = swsusp_suspend(); 181 error = swsusp_suspend();
180 device_resume(); 182 device_resume();
181 } 183 }
184 resume_console();
182 } 185 }
183 up(&pm_sem); 186 up(&pm_sem);
184 if (!error) 187 if (!error)
@@ -196,11 +199,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
196 snapshot_free_unused_memory(&data->handle); 199 snapshot_free_unused_memory(&data->handle);
197 down(&pm_sem); 200 down(&pm_sem);
198 pm_prepare_console(); 201 pm_prepare_console();
202 suspend_console();
199 error = device_suspend(PMSG_PRETHAW); 203 error = device_suspend(PMSG_PRETHAW);
200 if (!error) { 204 if (!error) {
201 error = swsusp_resume(); 205 error = swsusp_resume();
202 device_resume(); 206 device_resume();
203 } 207 }
208 resume_console();
204 pm_restore_console(); 209 pm_restore_console();
205 up(&pm_sem); 210 up(&pm_sem);
206 break; 211 break;
@@ -289,6 +294,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
289 } 294 }
290 295
291 /* Put devices to sleep */ 296 /* Put devices to sleep */
297 suspend_console();
292 error = device_suspend(PMSG_SUSPEND); 298 error = device_suspend(PMSG_SUSPEND);
293 if (error) { 299 if (error) {
294 printk(KERN_ERR "Failed to suspend some devices.\n"); 300 printk(KERN_ERR "Failed to suspend some devices.\n");
@@ -299,7 +305,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
299 /* Wake up devices */ 305 /* Wake up devices */
300 device_resume(); 306 device_resume();
301 } 307 }
302 308 resume_console();
303 if (pm_ops->finish) 309 if (pm_ops->finish)
304 pm_ops->finish(PM_SUSPEND_MEM); 310 pm_ops->finish(PM_SUSPEND_MEM);
305 311
diff --git a/kernel/printk.c b/kernel/printk.c
index 771f5e861bcd..f7d427ef5038 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -820,15 +820,8 @@ void release_console_sem(void)
820 console_locked = 0; 820 console_locked = 0;
821 up(&console_sem); 821 up(&console_sem);
822 spin_unlock_irqrestore(&logbuf_lock, flags); 822 spin_unlock_irqrestore(&logbuf_lock, flags);
823 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { 823 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
824 /* 824 wake_up_interruptible(&log_wait);
825 * If we printk from within the lock dependency code,
826 * from within the scheduler code, then do not lock
827 * up due to self-recursion:
828 */
829 if (!lockdep_internal())
830 wake_up_interruptible(&log_wait);
831 }
832} 825}
833EXPORT_SYMBOL(release_console_sem); 826EXPORT_SYMBOL(release_console_sem);
834 827
diff --git a/kernel/profile.c b/kernel/profile.c
index 857300a2afec..f940b462eec9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -399,7 +399,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe
399 unsigned long full_count = count, err; 399 unsigned long full_count = count, err;
400 cpumask_t new_value; 400 cpumask_t new_value;
401 401
402 err = cpumask_parse(buffer, count, new_value); 402 err = cpumask_parse_user(buffer, count, new_value);
403 if (err) 403 if (err)
404 return err; 404 return err;
405 405
diff --git a/kernel/sched.c b/kernel/sched.c
index 53608a59d6e3..094b5687eef6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1822,14 +1822,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
1822 struct mm_struct *mm = next->mm; 1822 struct mm_struct *mm = next->mm;
1823 struct mm_struct *oldmm = prev->active_mm; 1823 struct mm_struct *oldmm = prev->active_mm;
1824 1824
1825 if (unlikely(!mm)) { 1825 if (!mm) {
1826 next->active_mm = oldmm; 1826 next->active_mm = oldmm;
1827 atomic_inc(&oldmm->mm_count); 1827 atomic_inc(&oldmm->mm_count);
1828 enter_lazy_tlb(oldmm, next); 1828 enter_lazy_tlb(oldmm, next);
1829 } else 1829 } else
1830 switch_mm(oldmm, mm, next); 1830 switch_mm(oldmm, mm, next);
1831 1831
1832 if (unlikely(!prev->mm)) { 1832 if (!prev->mm) {
1833 prev->active_mm = NULL; 1833 prev->active_mm = NULL;
1834 WARN_ON(rq->prev_mm); 1834 WARN_ON(rq->prev_mm);
1835 rq->prev_mm = oldmm; 1835 rq->prev_mm = oldmm;
@@ -3491,7 +3491,7 @@ asmlinkage void __sched preempt_schedule(void)
3491 * If there is a non-zero preempt_count or interrupts are disabled, 3491 * If there is a non-zero preempt_count or interrupts are disabled,
3492 * we do not want to preempt the current task. Just return.. 3492 * we do not want to preempt the current task. Just return..
3493 */ 3493 */
3494 if (unlikely(ti->preempt_count || irqs_disabled())) 3494 if (likely(ti->preempt_count || irqs_disabled()))
3495 return; 3495 return;
3496 3496
3497need_resched: 3497need_resched:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cfc737bffe6d..3df9bfc7ff78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -28,6 +28,7 @@
28#include <linux/notifier.h> 28#include <linux/notifier.h>
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h>
31 32
32/* 33/*
33 * The per-CPU workqueue (if single thread, we always use the first 34 * The per-CPU workqueue (if single thread, we always use the first
@@ -245,6 +246,12 @@ static int worker_thread(void *__cwq)
245 sigprocmask(SIG_BLOCK, &blocked, NULL); 246 sigprocmask(SIG_BLOCK, &blocked, NULL);
246 flush_signals(current); 247 flush_signals(current);
247 248
249 /*
250 * We inherited MPOL_INTERLEAVE from the booting kernel.
251 * Set MPOL_DEFAULT to insure node local allocations.
252 */
253 numa_default_policy();
254
248 /* SIG_IGN makes children autoreap: see do_notify_parent(). */ 255 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
249 sa.sa.sa_handler = SIG_IGN; 256 sa.sa.sa_handler = SIG_IGN;
250 sa.sa.sa_flags = 0; 257 sa.sa.sa_flags = 0;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 756a908c441d..8fd2dbf7eb5b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -71,7 +71,7 @@ config LOG_BUF_SHIFT
71 71
72config DETECT_SOFTLOCKUP 72config DETECT_SOFTLOCKUP
73 bool "Detect Soft Lockups" 73 bool "Detect Soft Lockups"
74 depends on DEBUG_KERNEL 74 depends on DEBUG_KERNEL && !S390
75 default y 75 default y
76 help 76 help
77 Say Y here to enable the kernel to detect "soft lockups", 77 Say Y here to enable the kernel to detect "soft lockups",
@@ -371,6 +371,19 @@ config FORCED_INLINING
371 become the default in the future, until then this option is there to 371 become the default in the future, until then this option is there to
372 test gcc for this. 372 test gcc for this.
373 373
374config HEADERS_CHECK
375 bool "Run 'make headers_check' when building vmlinux"
376 help
377 This option will extract the user-visible kernel headers whenever
378 building the kernel, and will run basic sanity checks on them to
379 ensure that exported files do not attempt to include files which
380 were not exported, etc.
381
382 If you're making modifications to header files which are
383 relevant for userspace, say 'Y', and check the headers
384 exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in
385 your build tree), to make sure they're suitable.
386
374config RCU_TORTURE_TEST 387config RCU_TORTURE_TEST
375 tristate "torture tests for RCU" 388 tristate "torture tests for RCU"
376 depends on DEBUG_KERNEL 389 depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 8e6662bb9c37..59070dbfbeb4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@
5lib-y := ctype.o string.o vsprintf.o cmdline.o \ 5lib-y := ctype.o string.o vsprintf.o cmdline.o \
6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ 6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o irq_regs.o 8 sha1.o irq_regs.o carta_random32.o
9 9
10lib-$(CONFIG_MMU) += ioremap.o 10lib-$(CONFIG_MMU) += ioremap.o
11lib-$(CONFIG_SMP) += cpumask.o 11lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d71e38c54ea5..037fa9aa2ed7 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -316,10 +316,11 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
316EXPORT_SYMBOL(bitmap_scnprintf); 316EXPORT_SYMBOL(bitmap_scnprintf);
317 317
318/** 318/**
319 * bitmap_parse - convert an ASCII hex string into a bitmap. 319 * __bitmap_parse - convert an ASCII hex string into a bitmap.
320 * @ubuf: pointer to buffer in user space containing string. 320 * @buf: pointer to buffer containing string.
321 * @ubuflen: buffer size in bytes. If string is smaller than this 321 * @buflen: buffer size in bytes. If string is smaller than this
322 * then it must be terminated with a \0. 322 * then it must be terminated with a \0.
323 * @is_user: location of buffer, 0 indicates kernel space
323 * @maskp: pointer to bitmap array that will contain result. 324 * @maskp: pointer to bitmap array that will contain result.
324 * @nmaskbits: size of bitmap, in bits. 325 * @nmaskbits: size of bitmap, in bits.
325 * 326 *
@@ -330,11 +331,13 @@ EXPORT_SYMBOL(bitmap_scnprintf);
330 * characters and for grouping errors such as "1,,5", ",44", "," and "". 331 * characters and for grouping errors such as "1,,5", ",44", "," and "".
331 * Leading and trailing whitespace accepted, but not embedded whitespace. 332 * Leading and trailing whitespace accepted, but not embedded whitespace.
332 */ 333 */
333int bitmap_parse(const char __user *ubuf, unsigned int ubuflen, 334int __bitmap_parse(const char *buf, unsigned int buflen,
334 unsigned long *maskp, int nmaskbits) 335 int is_user, unsigned long *maskp,
336 int nmaskbits)
335{ 337{
336 int c, old_c, totaldigits, ndigits, nchunks, nbits; 338 int c, old_c, totaldigits, ndigits, nchunks, nbits;
337 u32 chunk; 339 u32 chunk;
340 const char __user *ubuf = buf;
338 341
339 bitmap_zero(maskp, nmaskbits); 342 bitmap_zero(maskp, nmaskbits);
340 343
@@ -343,11 +346,15 @@ int bitmap_parse(const char __user *ubuf, unsigned int ubuflen,
343 chunk = ndigits = 0; 346 chunk = ndigits = 0;
344 347
345 /* Get the next chunk of the bitmap */ 348 /* Get the next chunk of the bitmap */
346 while (ubuflen) { 349 while (buflen) {
347 old_c = c; 350 old_c = c;
348 if (get_user(c, ubuf++)) 351 if (is_user) {
349 return -EFAULT; 352 if (__get_user(c, ubuf++))
350 ubuflen--; 353 return -EFAULT;
354 }
355 else
356 c = *buf++;
357 buflen--;
351 if (isspace(c)) 358 if (isspace(c))
352 continue; 359 continue;
353 360
@@ -388,11 +395,36 @@ int bitmap_parse(const char __user *ubuf, unsigned int ubuflen,
388 nbits += (nchunks == 1) ? nbits_to_hold_value(chunk) : CHUNKSZ; 395 nbits += (nchunks == 1) ? nbits_to_hold_value(chunk) : CHUNKSZ;
389 if (nbits > nmaskbits) 396 if (nbits > nmaskbits)
390 return -EOVERFLOW; 397 return -EOVERFLOW;
391 } while (ubuflen && c == ','); 398 } while (buflen && c == ',');
392 399
393 return 0; 400 return 0;
394} 401}
395EXPORT_SYMBOL(bitmap_parse); 402EXPORT_SYMBOL(__bitmap_parse);
403
404/**
405 * bitmap_parse_user()
406 *
407 * @ubuf: pointer to user buffer containing string.
408 * @ulen: buffer size in bytes. If string is smaller than this
409 * then it must be terminated with a \0.
410 * @maskp: pointer to bitmap array that will contain result.
411 * @nmaskbits: size of bitmap, in bits.
412 *
413 * Wrapper for __bitmap_parse(), providing it with user buffer.
414 *
415 * We cannot have this as an inline function in bitmap.h because it needs
416 * linux/uaccess.h to get the access_ok() declaration and this causes
417 * cyclic dependencies.
418 */
419int bitmap_parse_user(const char __user *ubuf,
420 unsigned int ulen, unsigned long *maskp,
421 int nmaskbits)
422{
423 if (!access_ok(VERIFY_READ, ubuf, ulen))
424 return -EFAULT;
425 return __bitmap_parse((const char *)ubuf, ulen, 1, maskp, nmaskbits);
426}
427EXPORT_SYMBOL(bitmap_parse_user);
396 428
397/* 429/*
398 * bscnl_emit(buf, buflen, rbot, rtop, bp) 430 * bscnl_emit(buf, buflen, rbot, rtop, bp)
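For context on how the new split is consumed: in-kernel strings keep using the kernel-space parser, while proc write handlers switch to the _user wrapper, as in the irq_affinity_write_proc() and prof_cpu_mask_write_proc() changes above. A hedged sketch of such a caller (the function name is illustrative and not part of this patch):

#include <linux/cpumask.h>

static int example_write_mask(const char __user *buffer, unsigned long count)
{
	cpumask_t new_value;
	int err;

	/* Parse a hex cpumask written by userspace; access_ok() and the
	 * user-space copy are handled inside bitmap_parse_user(). */
	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		return err;

	/* ... apply new_value ... */
	return count;
}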
diff --git a/lib/carta_random32.c b/lib/carta_random32.c
new file mode 100644
index 000000000000..ca82df70eee4
--- /dev/null
+++ b/lib/carta_random32.c
@@ -0,0 +1,41 @@
1/*
2 * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
3 * Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
17 * 02111-1307 USA
18 */
19#include <linux/types.h>
20#include <linux/module.h>
21
22/*
23 * Fast, simple, yet decent quality random number generator based on
24 * a paper by David G. Carta ("Two Fast Implementations of the
25 * `Minimal Standard' Random Number Generator," Communications of the
26 * ACM, January, 1990).
27 */
28u64 carta_random32 (u64 seed)
29{
30# define A 16807
31# define M ((u32) 1 << 31)
32 u64 s, prod = A * seed, p, q;
33
34 p = (prod >> 31) & (M - 1);
35 q = (prod >> 0) & (M - 1);
36 s = p + q;
37 if (s >= M)
38 s -= M - 1;
39 return s;
40}
41EXPORT_SYMBOL_GPL(carta_random32);
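The generator is stateless, so a caller carries the state by feeding each output back in as the next seed. A minimal usage sketch (hypothetical caller; the extern declaration is shown here only because this patch adds no header for it):

#include <linux/types.h>

extern u64 carta_random32(u64 seed);

/* Draw one pseudo-random value; the seed must start non-zero, and each
 * returned value becomes the next seed. */
static u32 example_draw(u64 *state)
{
	*state = carta_random32(*state);
	return (u32)*state;
}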
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1d709ff528e1..2dbec90dc3ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -356,8 +356,8 @@ nomem:
356 return -ENOMEM; 356 return -ENOMEM;
357} 357}
358 358
359void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 359void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
360 unsigned long end) 360 unsigned long end)
361{ 361{
362 struct mm_struct *mm = vma->vm_mm; 362 struct mm_struct *mm = vma->vm_mm;
363 unsigned long address; 363 unsigned long address;
@@ -398,6 +398,24 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
398 } 398 }
399} 399}
400 400
401void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
402 unsigned long end)
403{
404 /*
405 * It is undesirable to test vma->vm_file as it should be non-null
406 * for valid hugetlb area. However, vm_file will be NULL in the error
407 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
408 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
409 * to clean up. Since no pte has actually been setup, it is safe to
410 * do nothing in this case.
411 */
412 if (vma->vm_file) {
413 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
414 __unmap_hugepage_range(vma, start, end);
415 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
416 }
417}
418
401static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 419static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
402 unsigned long address, pte_t *ptep, pte_t pte) 420 unsigned long address, pte_t *ptep, pte_t pte)
403{ 421{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 25788b1b7fcf..617fb31086ee 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -727,7 +727,7 @@ int do_migrate_pages(struct mm_struct *mm,
727 return -ENOSYS; 727 return -ENOSYS;
728} 728}
729 729
730static struct page *new_vma_page(struct page *page, unsigned long private) 730static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
731{ 731{
732 return NULL; 732 return NULL;
733} 733}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8c003e7b3d5..40db96a655d0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -495,17 +495,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
495 int i; 495 int i;
496 int reserved = 0; 496 int reserved = 0;
497 497
498 arch_free_page(page, order);
499 if (!PageHighMem(page))
500 debug_check_no_locks_freed(page_address(page),
501 PAGE_SIZE<<order);
502
503 for (i = 0 ; i < (1 << order) ; ++i) 498 for (i = 0 ; i < (1 << order) ; ++i)
504 reserved += free_pages_check(page + i); 499 reserved += free_pages_check(page + i);
505 if (reserved) 500 if (reserved)
506 return; 501 return;
507 502
503 if (!PageHighMem(page))
504 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
505 arch_free_page(page, order);
508 kernel_map_pages(page, 1 << order, 0); 506 kernel_map_pages(page, 1 << order, 0);
507
509 local_irq_save(flags); 508 local_irq_save(flags);
510 __count_vm_events(PGFREE, 1 << order); 509 __count_vm_events(PGFREE, 1 << order);
511 free_one_page(page_zone(page), page, order); 510 free_one_page(page_zone(page), page, order);
@@ -781,13 +780,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
781 struct per_cpu_pages *pcp; 780 struct per_cpu_pages *pcp;
782 unsigned long flags; 781 unsigned long flags;
783 782
784 arch_free_page(page, 0);
785
786 if (PageAnon(page)) 783 if (PageAnon(page))
787 page->mapping = NULL; 784 page->mapping = NULL;
788 if (free_pages_check(page)) 785 if (free_pages_check(page))
789 return; 786 return;
790 787
788 if (!PageHighMem(page))
789 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
790 arch_free_page(page, 0);
791 kernel_map_pages(page, 1, 0); 791 kernel_map_pages(page, 1, 0);
792 792
793 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 793 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
@@ -2294,19 +2294,6 @@ unsigned long __init zone_absent_pages_in_node(int nid,
2294 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2294 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2295} 2295}
2296 2296
2297/* Return the zone index a PFN is in */
2298int memmap_zone_idx(struct page *lmem_map)
2299{
2300 int i;
2301 unsigned long phys_addr = virt_to_phys(lmem_map);
2302 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2303
2304 for (i = 0; i < MAX_NR_ZONES; i++)
2305 if (pfn < arch_zone_highest_possible_pfn[i])
2306 break;
2307
2308 return i;
2309}
2310#else 2297#else
2311static inline unsigned long zone_spanned_pages_in_node(int nid, 2298static inline unsigned long zone_spanned_pages_in_node(int nid,
2312 unsigned long zone_type, 2299 unsigned long zone_type,
@@ -2325,10 +2312,6 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
2325 return zholes_size[zone_type]; 2312 return zholes_size[zone_type];
2326} 2313}
2327 2314
2328static inline int memmap_zone_idx(struct page *lmem_map)
2329{
2330 return MAX_NR_ZONES;
2331}
2332#endif 2315#endif
2333 2316
2334static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2317static void __init calculate_node_totalpages(struct pglist_data *pgdat,
diff --git a/mm/rmap.c b/mm/rmap.c
index e2155d791d99..a9136d8b7577 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -576,15 +576,14 @@ void page_add_file_rmap(struct page *page)
576void page_remove_rmap(struct page *page) 576void page_remove_rmap(struct page *page)
577{ 577{
578 if (atomic_add_negative(-1, &page->_mapcount)) { 578 if (atomic_add_negative(-1, &page->_mapcount)) {
579#ifdef CONFIG_DEBUG_VM
580 if (unlikely(page_mapcount(page) < 0)) { 579 if (unlikely(page_mapcount(page) < 0)) {
581 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 580 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
582 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 581 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
583 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 582 printk (KERN_EMERG " page->count = %x\n", page_count(page));
584 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 583 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
584 BUG();
585 } 585 }
586#endif 586
587 BUG_ON(page_mapcount(page) < 0);
588 /* 587 /*
589 * It would be tidy to reset the PageAnon mapping here, 588 * It would be tidy to reset the PageAnon mapping here,
590 * but that might overwrite a racing page_add_anon_rmap 589 * but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index c946bf468718..f5664c5b9eb1 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -35,7 +35,7 @@ shmem_get_acl(struct inode *inode, int type)
35} 35}
36 36
37/** 37/**
38 * shmem_get_acl - generic_acl_operations->setacl() operation 38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */ 39 */
40static void 40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) 41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
diff --git a/mm/truncate.c b/mm/truncate.c
index f4edbc179d14..11ca480701dd 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -302,7 +302,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
302 if (page->mapping != mapping) 302 if (page->mapping != mapping)
303 return 0; 303 return 0;
304 304
305 if (PagePrivate(page) && !try_to_release_page(page, 0)) 305 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
306 return 0; 306 return 0;
307 307
308 write_lock_irq(&mapping->tree_lock); 308 write_lock_irq(&mapping->tree_lock);
@@ -396,6 +396,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
396 pagevec_release(&pvec); 396 pagevec_release(&pvec);
397 cond_resched(); 397 cond_resched();
398 } 398 }
399 WARN_ON_ONCE(ret);
399 return ret; 400 return ret;
400} 401}
401EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 402EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/scripts/kconfig/lxdialog/dialog.h b/scripts/kconfig/lxdialog/dialog.h
index 8dea47f9d3e4..fd695e1070f7 100644
--- a/scripts/kconfig/lxdialog/dialog.h
+++ b/scripts/kconfig/lxdialog/dialog.h
@@ -24,6 +24,7 @@
24#include <ctype.h> 24#include <ctype.h>
25#include <stdlib.h> 25#include <stdlib.h>
26#include <string.h> 26#include <string.h>
27#include <stdbool.h>
27 28
28#ifdef __sun__ 29#ifdef __sun__
29#define CURS_MACROS 30#define CURS_MACROS
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 00d1ad19b2cc..187f5de4612c 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1262,7 +1262,9 @@ sub output_intro_text(%) {
1262} 1262}
1263 1263
1264## 1264##
1265# generic output function for typedefs 1265# generic output function for all types (function, struct/union, typedef, enum);
1266# calls the generated, variable output_ function name based on
1267# functype and output_mode
1266sub output_declaration { 1268sub output_declaration {
1267 no strict 'refs'; 1269 no strict 'refs';
1268 my $name = shift; 1270 my $name = shift;
@@ -1278,8 +1280,7 @@ sub output_declaration {
1278} 1280}
1279 1281
1280## 1282##
1281# generic output function - calls the right one based 1283# generic output function - calls the right one based on current output mode.
1282# on current output mode.
1283sub output_intro { 1284sub output_intro {
1284 no strict 'refs'; 1285 no strict 'refs';
1285 my $func = "output_intro_".$output_mode; 1286 my $func = "output_intro_".$output_mode;
@@ -1518,6 +1519,9 @@ sub dump_function($$) {
1518 $prototype =~ s/^asmlinkage +//; 1519 $prototype =~ s/^asmlinkage +//;
1519 $prototype =~ s/^inline +//; 1520 $prototype =~ s/^inline +//;
1520 $prototype =~ s/^__inline__ +//; 1521 $prototype =~ s/^__inline__ +//;
1522 $prototype =~ s/^__inline +//;
1523 $prototype =~ s/^__always_inline +//;
1524 $prototype =~ s/^noinline +//;
1521 $prototype =~ s/__devinit +//; 1525 $prototype =~ s/__devinit +//;
1522 $prototype =~ s/^#define +//; #ak added 1526 $prototype =~ s/^#define +//; #ak added
1523 $prototype =~ s/__attribute__ \(\([a-z,]*\)\)//; 1527 $prototype =~ s/__attribute__ \(\([a-z,]*\)\)//;
@@ -1778,8 +1782,9 @@ sub process_file($) {
1778 $in_doc_sect = 1; 1782 $in_doc_sect = 1;
1779 $contents = $newcontents; 1783 $contents = $newcontents;
1780 if ($contents ne "") { 1784 if ($contents ne "") {
1781 if (substr($contents, 0, 1) eq " ") { 1785 while ((substr($contents, 0, 1) eq " ") ||
1782 $contents = substr($contents, 1); 1786 substr($contents, 0, 1) eq "\t") {
1787 $contents = substr($contents, 1);
1783 } 1788 }
1784 $contents .= "\n"; 1789 $contents .= "\n";
1785 } 1790 }