-rw-r--r--  Documentation/ABI/testing/sysfs-fs-ext4 | 81
-rw-r--r--  Documentation/filesystems/ext4.txt | 30
-rw-r--r--  Documentation/filesystems/proc.txt | 21
-rw-r--r--  arch/ia64/include/asm/intrinsics.h | 6
-rw-r--r--  arch/ia64/include/asm/mmu_context.h | 6
-rw-r--r--  arch/ia64/include/asm/module.h | 6
-rw-r--r--  arch/ia64/include/asm/native/inst.h | 13
-rw-r--r--  arch/ia64/include/asm/native/patchlist.h | 38
-rw-r--r--  arch/ia64/include/asm/native/pvchk_inst.h | 8
-rw-r--r--  arch/ia64/include/asm/paravirt.h | 65
-rw-r--r--  arch/ia64/include/asm/paravirt_patch.h | 143
-rw-r--r--  arch/ia64/include/asm/paravirt_privop.h | 365
-rw-r--r--  arch/ia64/include/asm/smp.h | 3
-rw-r--r--  arch/ia64/include/asm/timex.h | 1
-rw-r--r--  arch/ia64/include/asm/topology.h | 5
-rw-r--r--  arch/ia64/include/asm/xen/hypervisor.h | 39
-rw-r--r--  arch/ia64/include/asm/xen/inst.h | 28
-rw-r--r--  arch/ia64/include/asm/xen/interface.h | 9
-rw-r--r--  arch/ia64/include/asm/xen/minstate.h | 11
-rw-r--r--  arch/ia64/include/asm/xen/patchlist.h | 38
-rw-r--r--  arch/ia64/include/asm/xen/privop.h | 8
-rw-r--r--  arch/ia64/kernel/Makefile | 39
-rw-r--r--  arch/ia64/kernel/Makefile.gate | 27
-rw-r--r--  arch/ia64/kernel/acpi.c | 8
-rw-r--r--  arch/ia64/kernel/asm-offsets.c | 2
-rw-r--r--  arch/ia64/kernel/efi.c | 1
-rw-r--r--  arch/ia64/kernel/entry.S | 4
-rw-r--r--  arch/ia64/kernel/fsys.S | 35
-rw-r--r--  arch/ia64/kernel/gate.S | 171
-rw-r--r--  arch/ia64/kernel/gate.lds.S | 17
-rw-r--r--  arch/ia64/kernel/head.S | 10
-rw-r--r--  arch/ia64/kernel/ivt.S | 2
-rw-r--r--  arch/ia64/kernel/mca.c | 6
-rw-r--r--  arch/ia64/kernel/module.c | 35
-rw-r--r--  arch/ia64/kernel/paravirt.c | 539
-rw-r--r--  arch/ia64/kernel/paravirt_patch.c | 514
-rw-r--r--  arch/ia64/kernel/paravirt_patchlist.c | 79
-rw-r--r--  arch/ia64/kernel/paravirt_patchlist.h | 28
-rw-r--r--  arch/ia64/kernel/paravirtentry.S | 99
-rw-r--r--  arch/ia64/kernel/patch.c | 40
-rw-r--r--  arch/ia64/kernel/perfmon.c | 4
-rw-r--r--  arch/ia64/kernel/salinfo.c | 6
-rw-r--r--  arch/ia64/kernel/setup.c | 9
-rw-r--r--  arch/ia64/kernel/smp.c | 6
-rw-r--r--  arch/ia64/kernel/smpboot.c | 17
-rw-r--r--  arch/ia64/kernel/time.c | 9
-rw-r--r--  arch/ia64/kernel/vmlinux.lds.S | 30
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 2
-rw-r--r--  arch/ia64/kvm/vcpu.c | 2
-rw-r--r--  arch/ia64/kvm/vtlb.c | 2
-rw-r--r--  arch/ia64/mm/init.c | 12
-rw-r--r--  arch/ia64/mm/tlb.c | 2
-rw-r--r--  arch/ia64/scripts/pvcheck.sed | 1
-rw-r--r--  arch/ia64/sn/kernel/io_common.c | 15
-rw-r--r--  arch/ia64/sn/kernel/io_init.c | 12
-rw-r--r--  arch/ia64/sn/kernel/setup.c | 5
-rw-r--r--  arch/ia64/sn/kernel/sn2/sn2_smp.c | 12
-rw-r--r--  arch/ia64/sn/kernel/sn2/sn_hwperf.c | 8
-rw-r--r--  arch/ia64/sn/pci/pcibr/pcibr_dma.c | 4
-rw-r--r--  arch/ia64/xen/Makefile | 19
-rw-r--r--  arch/ia64/xen/gate-data.S | 3
-rw-r--r--  arch/ia64/xen/hypercall.S | 2
-rw-r--r--  arch/ia64/xen/time.c | 48
-rw-r--r--  arch/ia64/xen/xen_pv_ops.c | 800
-rw-r--r--  arch/x86/boot/memory.c | 39
-rw-r--r--  drivers/gpu/drm/drm_crtc_helper.c | 31
-rw-r--r--  drivers/gpu/drm/drm_edid.c | 8
-rw-r--r--  drivers/gpu/drm/i915/i915_dma.c | 2
-rw-r--r--  drivers/s390/net/qeth_core_offl.c | 0
-rw-r--r--  drivers/s390/net/qeth_core_offl.h | 0
-rw-r--r--  drivers/serial/serial_core.c | 2
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/btrfs_inode.h | 31
-rw-r--r--  fs/btrfs/ctree.c | 588
-rw-r--r--  fs/btrfs/ctree.h | 69
-rw-r--r--  fs/btrfs/delayed-ref.c | 669
-rw-r--r--  fs/btrfs/delayed-ref.h | 193
-rw-r--r--  fs/btrfs/dir-item.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 81
-rw-r--r--  fs/btrfs/disk-io.h | 1
-rw-r--r--  fs/btrfs/extent-tree.c | 1674
-rw-r--r--  fs/btrfs/extent_io.c | 51
-rw-r--r--  fs/btrfs/extent_io.h | 3
-rw-r--r--  fs/btrfs/file-item.c | 7
-rw-r--r--  fs/btrfs/file.c | 50
-rw-r--r--  fs/btrfs/inode-item.c | 3
-rw-r--r--  fs/btrfs/inode.c | 194
-rw-r--r--  fs/btrfs/locking.c | 21
-rw-r--r--  fs/btrfs/ordered-data.c | 118
-rw-r--r--  fs/btrfs/ordered-data.h | 4
-rw-r--r--  fs/btrfs/transaction.c | 151
-rw-r--r--  fs/btrfs/transaction.h | 8
-rw-r--r--  fs/btrfs/tree-defrag.c | 2
-rw-r--r--  fs/btrfs/tree-log.c | 444
-rw-r--r--  fs/btrfs/tree-log.h | 17
-rw-r--r--  fs/ext4/balloc.c | 14
-rw-r--r--  fs/ext4/dir.c | 16
-rw-r--r--  fs/ext4/ext4.h | 91
-rw-r--r--  fs/ext4/ext4_extents.h | 1
-rw-r--r--  fs/ext4/ext4_i.h | 6
-rw-r--r--  fs/ext4/ext4_sb.h | 14
-rw-r--r--  fs/ext4/extents.c | 127
-rw-r--r--  fs/ext4/file.c | 7
-rw-r--r--  fs/ext4/ialloc.c | 273
-rw-r--r--  fs/ext4/inode.c | 424
-rw-r--r--  fs/ext4/ioctl.c | 17
-rw-r--r--  fs/ext4/mballoc.c | 158
-rw-r--r--  fs/ext4/mballoc.h | 8
-rw-r--r--  fs/ext4/namei.c | 164
-rw-r--r--  fs/ext4/resize.c | 8
-rw-r--r--  fs/ext4/super.c | 327
-rw-r--r--  fs/jbd2/commit.c | 5
-rw-r--r--  fs/jbd2/revoke.c | 24
-rw-r--r--  fs/jbd2/transaction.c | 2
-rw-r--r--  fs/lockd/clntlock.c | 51
-rw-r--r--  fs/lockd/mon.c | 8
-rw-r--r--  fs/lockd/svc.c | 42
-rw-r--r--  fs/nfs/callback.c | 31
-rw-r--r--  fs/nfs/callback.h | 1
-rw-r--r--  fs/nfs/client.c | 116
-rw-r--r--  fs/nfs/dir.c | 9
-rw-r--r--  fs/nfs/file.c | 32
-rw-r--r--  fs/nfs/getroot.c | 4
-rw-r--r--  fs/nfs/inode.c | 309
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/nfs2xdr.c | 9
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs3xdr.c | 37
-rw-r--r--  fs/nfs/nfs4proc.c | 47
-rw-r--r--  fs/nfs/nfs4state.c | 10
-rw-r--r--  fs/nfs/nfs4xdr.c | 213
-rw-r--r--  fs/nfs/pagelist.c | 11
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfs/write.c | 53
-rw-r--r--  fs/nfsd/nfsctl.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 5
-rw-r--r--  include/drm/drm_crtc_helper.h | 3
-rw-r--r--  include/drm/drm_os_linux.h | 4
-rw-r--r--  include/linux/jbd2.h | 6
-rw-r--r--  include/linux/nfs_fs.h | 4
-rw-r--r--  include/linux/nfs_fs_sb.h | 5
-rw-r--r--  include/linux/nfs_xdr.h | 59
-rw-r--r--  include/linux/sunrpc/svc.h | 9
-rw-r--r--  include/linux/sunrpc/svc_xprt.h | 52
-rw-r--r--  include/linux/sunrpc/xprt.h | 2
-rw-r--r--  net/sunrpc/Kconfig | 22
-rw-r--r--  net/sunrpc/clnt.c | 48
-rw-r--r--  net/sunrpc/rpcb_clnt.c | 103
-rw-r--r--  net/sunrpc/svc.c | 158
-rw-r--r--  net/sunrpc/svc_xprt.c | 31
-rw-r--r--  net/sunrpc/svcsock.c | 40
-rw-r--r--  net/sunrpc/xprt.c | 89
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 26
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8
-rw-r--r--  net/sunrpc/xprtsock.c | 363
156 files changed, 8419 insertions, 3379 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 000000000000..4e79074de282
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What:		/sys/fs/ext4/<disk>/mb_stats
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Controls whether the multiblock allocator should
+		collect statistics, which are shown during the unmount.
+		1 means to collect statistics, 0 means not to collect
+		statistics
+
+What:		/sys/fs/ext4/<disk>/mb_group_prealloc
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The multiblock allocator will round up allocation
+		requests to a multiple of this tuning parameter if the
+		stripe size is not set in the ext4 superblock
+
+What:		/sys/fs/ext4/<disk>/mb_max_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_min_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The minimum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_order2_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the minimum size for
+		requests (as a power of 2) where the buddy cache is
+		used
+
+What:		/sys/fs/ext4/<disk>/mb_stream_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Files which have fewer blocks than this tunable
+		parameter will have their blocks allocated out of a
+		block group specific preallocation pool, so that small
+		files are packed closely together.  Each large file
+		will have its blocks allocated out of its own unique
+		preallocation pool.
+
+What:		/sys/fs/ext4/<disk>/inode_readahead
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the maximum number of
+		inode table blocks that ext4's inode table readahead
+		algorithm will pre-read into the buffer cache
+
+What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of blocks
+		that are dirty in the page cache, but which do not
+		have their location in the filesystem allocated yet.
+
+What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of kilobytes
+		of data that have been written to this filesystem since it was
+		created.
+
+What:		/sys/fs/ext4/<disk>/session_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of
+		kilobytes of data that have been written to this
+		filesystem since it was mounted.
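[Aside, not part of the patch: these sysfs tunables are plain text files, so they can be read and written with ordinary file I/O. A minimal C sketch; the device name "sda1" and the chosen tunable are assumptions for illustration only.]

	#include <stdio.h>

	/* Read one ext4 sysfs tunable and print it.  Illustrative only. */
	int main(void)
	{
		unsigned long kbytes;
		FILE *f = fopen("/sys/fs/ext4/sda1/lifetime_write_kbytes", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%lu", &kbytes) == 1)
			printf("lifetime writes: %lu KiB\n", kbytes);
		fclose(f);
		return 0;
	}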
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc7291..97882df04865 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redundancy in tree
 * improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
   the ordering)
 
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
 2.2 Candidate features for future inclusion
 
 * Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
 			performance.
 
 barrier=<0|1(*)>	This enables/disables the use of write barriers in
-			the jbd code. barrier=0 disables, barrier=1 enables.
-			This also requires an IO stack which can support
+barrier(*)		the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier		This also requires an IO stack which can support
 			barriers, and if jbd gets an error on a barrier
 			write, it will disable again with a warning.
 			Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
 			safe to use, at some performance penalty. If
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
+			The mount options "barrier" and "nobarrier" can
+			also be used to enable or disable barriers, for
+			consistency with other ext4 mount options.
 
 inode_readahead=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
 			a slightly higher priority than the default I/O
 			priority.
 
+auto_da_alloc(*)	Many broken applications don't use fsync() when
+noauto_da_alloc		replacing existing files via patterns such as
+			fd = open("foo.new")/write(fd,..)/close(fd)/
+			rename("foo.new", "foo"), or worse yet,
+			fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+			If auto_da_alloc is enabled, ext4 will detect
+			the replace-via-rename and replace-via-truncate
+			patterns and force that any delayed allocation
+			blocks are allocated such that at the next
+			journal commit, in the default data=ordered
+			mode, the data blocks of the new file are forced
+			to disk before the rename() operation is
+			committed.  This provides roughly the same level
+			of guarantees as ext3, and avoids the
+			"zero-length" problem that can happen when a
+			system crashes before the delayed allocation
+			blocks are forced to disk.
+
 Data Mode
 =========
 There are 3 different data modes:
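[For illustration, not part of the patch: the replace-via-rename pattern that auto_da_alloc detects looks like the C sketch below. A careful application calls fsync() itself before rename(), which is the ordering the heuristic approximates for applications that don't. The file names are hypothetical.]

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Replace "foo" atomically via a temporary file.  Without the
	 * fsync(), ext4's auto_da_alloc heuristic is what keeps a crash
	 * from leaving a zero-length "foo"; with it, the ordering is
	 * explicit. */
	static int replace_file(const char *data, size_t len)
	{
		int fd = open("foo.new", O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (fd < 0)
			return -1;
		if (write(fd, data, len) != (ssize_t)len || fsync(fd) != 0) {
			close(fd);
			return -1;
		}
		if (close(fd) != 0 || rename("foo.new", "foo") != 0)
			return -1;
		return 0;
	}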
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0f..efc4fd9f40ce 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
 File            Content
 mb_groups       details of multiblock allocator buddy cache of free blocks
 mb_history      multiblock allocation history
-stats           controls whether the multiblock allocator should start
-                collecting statistics, which are shown during the unmount
-group_prealloc  the multiblock allocator will round up allocation
-                requests to a multiple of this tuning parameter if the
-                stripe size is not set in the ext4 superblock
-max_to_scan     The maximum number of extents the multiblock allocator
-                will search to find the best extent
-min_to_scan     The minimum number of extents the multiblock allocator
-                will search to find the best extent
-order2_req      Tuning parameter which controls the minimum size for
-                requests (as a power of 2) where the buddy cache is
-                used
-stream_req      Files which have fewer blocks than this tunable
-                parameter will have their blocks allocated out of a
-                block group specific preallocation pool, so that small
-                files are packed closely together.  Each large file
-                will have its blocks allocated out of its own unique
-                preallocation pool.
-inode_readahead Tuning parameter which controls the maximum number of
-                inode table blocks that ext4's inode table readahead
-                algorithm will pre-read into the buffer cache
 ..............................................................................
 
 
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
index c47830e26cb7..111ed5222892 100644
--- a/arch/ia64/include/asm/intrinsics.h
+++ b/arch/ia64/include/asm/intrinsics.h
@@ -202,7 +202,11 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void);
 
 #ifndef __ASSEMBLY__
 #if defined(CONFIG_PARAVIRT) && defined(__KERNEL__)
-#define IA64_INTRINSIC_API(name)	pv_cpu_ops.name
+#ifdef ASM_SUPPORTED
+# define IA64_INTRINSIC_API(name)	paravirt_ ## name
+#else
+# define IA64_INTRINSIC_API(name)	pv_cpu_ops.name
+#endif
 #define IA64_INTRINSIC_MACRO(name)	paravirt_ ## name
 #else
 #define IA64_INTRINSIC_API(name)	ia64_native_ ## name
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 040bc87db930..7f2a456603cb 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm)
 	/* re-check, now that we've got the lock: */
 	context = mm->context;
 	if (context == 0) {
-		cpus_clear(mm->cpu_vm_mask);
+		cpumask_clear(mm_cpumask(mm));
 		if (ia64_ctx.next >= ia64_ctx.limit) {
 			ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
 					ia64_ctx.max_ctx, ia64_ctx.next);
@@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm)
 
 	do {
 		context = get_mmu_context(mm);
-		if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
-			cpu_set(smp_processor_id(), mm->cpu_vm_mask);
+		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
+			cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
 		reload_context(context);
 		/*
 		 * in the unlikely event of a TLB-flush by another thread,
diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
index d2da61e4c49b..908eaef42a08 100644
--- a/arch/ia64/include/asm/module.h
+++ b/arch/ia64/include/asm/module.h
@@ -16,6 +16,12 @@ struct mod_arch_specific {
 	struct elf64_shdr *got;		/* global offset table */
 	struct elf64_shdr *opd;		/* official procedure descriptors */
 	struct elf64_shdr *unwind;	/* unwind-table section */
+#ifdef CONFIG_PARAVIRT
+	struct elf64_shdr *paravirt_bundles;
+					/* paravirt_alt_bundle_patch table */
+	struct elf64_shdr *paravirt_insts;
+					/* paravirt_alt_inst_patch table */
+#endif
 	unsigned long gp;		/* global-pointer for module */
 
 	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index 0a1026cca4fa..d2d46efb3e6e 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -30,6 +30,9 @@
 #define __paravirt_work_processed_syscall_target	\
 	ia64_work_processed_syscall
 
+#define paravirt_fsyscall_table		ia64_native_fsyscall_table
+#define paravirt_fsys_bubble_down	ia64_native_fsys_bubble_down
+
 #ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK
 # define PARAVIRT_POISON	0xdeadbeefbaadf00d
 # define CLOBBER(clob)				\
@@ -74,6 +77,11 @@
 (pred)	mov reg = psr		\
 	CLOBBER(clob)
 
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
+(pred)	mov reg = ar.itc				\
+	CLOBBER(clob)					\
+	CLOBBER_PRED(pred_clob)
+
 #define MOV_TO_IFA(reg, clob)	\
 	mov cr.ifa = reg	\
 	CLOBBER(clob)
@@ -158,6 +166,11 @@
 #define RSM_PSR_DT		\
 	rsm psr.dt
 
+#define RSM_PSR_BE_I(clob0, clob1)	\
+	rsm psr.be | psr.i		\
+	CLOBBER(clob0)			\
+	CLOBBER(clob1)
+
 #define SSM_PSR_DT_AND_SRLZ_I	\
 	ssm psr.dt		\
 	;;			\
diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h
new file mode 100644
index 000000000000..be16ca9311bf
--- /dev/null
+++ b/arch/ia64/include/asm/native/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/native/patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist		\
+	__ia64_native_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist			\
+	__ia64_native_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
+	__ia64_native_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
+	__ia64_native_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist			\
+	__ia64_native_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist			\
+	__ia64_native_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist		\
+	__ia64_native_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist		\
+	__ia64_native_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h
index b8e6eb1090d7..8d72962ec838 100644
--- a/arch/ia64/include/asm/native/pvchk_inst.h
+++ b/arch/ia64/include/asm/native/pvchk_inst.h
@@ -180,6 +180,11 @@
 	IS_PRED_IN(pred)				\
 	IS_RREG_OUT(reg)				\
 	IS_RREG_CLOB(clob)
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
+	IS_PRED_IN(pred)				\
+	IS_PRED_CLOB(pred_clob)				\
+	IS_RREG_OUT(reg)				\
+	IS_RREG_CLOB(clob)
 #define MOV_TO_IFA(reg, clob)				\
 	IS_RREG_IN(reg)					\
 	IS_RREG_CLOB(clob)
@@ -246,6 +251,9 @@
 	IS_RREG_CLOB(clob2)
 #define RSM_PSR_DT				\
 	nop 0
+#define RSM_PSR_BE_I(clob0, clob1)		\
+	IS_RREG_CLOB(clob0)			\
+	IS_RREG_CLOB(clob1)
 #define SSM_PSR_DT_AND_SRLZ_I			\
 	nop 0
 #define BSW_0(clob0, clob1, clob2)		\
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2bf3636473fe..2eb0a981a09a 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -22,6 +22,56 @@
 #ifndef __ASM_PARAVIRT_H
 #define __ASM_PARAVIRT_H
 
+#ifndef __ASSEMBLY__
+/******************************************************************************
+ * fsys related addresses
+ */
+struct pv_fsys_data {
+	unsigned long *fsyscall_table;
+	void *fsys_bubble_down;
+};
+
+extern struct pv_fsys_data pv_fsys_data;
+
+unsigned long *paravirt_get_fsyscall_table(void);
+char *paravirt_get_fsys_bubble_down(void);
+
+/******************************************************************************
+ * patchlist addresses for gate page
+ */
+enum pv_gate_patchlist {
+	PV_GATE_START_FSYSCALL,
+	PV_GATE_END_FSYSCALL,
+
+	PV_GATE_START_BRL_FSYS_BUBBLE_DOWN,
+	PV_GATE_END_BRL_FSYS_BUBBLE_DOWN,
+
+	PV_GATE_START_VTOP,
+	PV_GATE_END_VTOP,
+
+	PV_GATE_START_MCKINLEY_E9,
+	PV_GATE_END_MCKINLEY_E9,
+};
+
+struct pv_patchdata {
+	unsigned long start_fsyscall_patchlist;
+	unsigned long end_fsyscall_patchlist;
+	unsigned long start_brl_fsys_bubble_down_patchlist;
+	unsigned long end_brl_fsys_bubble_down_patchlist;
+	unsigned long start_vtop_patchlist;
+	unsigned long end_vtop_patchlist;
+	unsigned long start_mckinley_e9_patchlist;
+	unsigned long end_mckinley_e9_patchlist;
+
+	void *gate_section;
+};
+
+extern struct pv_patchdata pv_patchdata;
+
+unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type);
+void *paravirt_get_gate_section(void);
+#endif
+
 #ifdef CONFIG_PARAVIRT_GUEST
 
 #define PARAVIRT_HYPERVISOR_TYPE_DEFAULT	0
@@ -68,6 +118,14 @@ struct pv_init_ops {
 	int (*arch_setup_nomca)(void);
 
 	void (*post_smp_prepare_boot_cpu)(void);
+
+#ifdef ASM_SUPPORTED
+	unsigned long (*patch_bundle)(void *sbundle, void *ebundle,
+				      unsigned long type);
+	unsigned long (*patch_inst)(unsigned long stag, unsigned long etag,
+				    unsigned long type);
+#endif
+	void (*patch_branch)(unsigned long tag, unsigned long type);
 };
 
 extern struct pv_init_ops pv_init_ops;
@@ -210,6 +268,8 @@ struct pv_time_ops {
 	int (*do_steal_accounting)(unsigned long *new_itm);
 
 	void (*clocksource_resume)(void);
+
+	unsigned long long (*sched_clock)(void);
 };
 
 extern struct pv_time_ops pv_time_ops;
@@ -227,6 +287,11 @@ paravirt_do_steal_accounting(unsigned long *new_itm)
 	return pv_time_ops.do_steal_accounting(new_itm);
 }
 
+static inline unsigned long long paravirt_sched_clock(void)
+{
+	return pv_time_ops.sched_clock();
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #else
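[Usage sketch, mine rather than quoted from this diff: with the hook above, the architecture's sched_clock() can simply defer to whichever pv_time_ops.sched_clock the paravirt backend installed; natively that would be an ar.itc-based clock such as ia64_native_sched_clock() declared in timex.h below.]

	/* Illustrative only: a paravirtualized sched_clock() deferring to
	 * the installed pv_time_ops hook. */
	unsigned long long sched_clock(void)
	{
		return paravirt_sched_clock();
	}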
diff --git a/arch/ia64/include/asm/paravirt_patch.h b/arch/ia64/include/asm/paravirt_patch.h
new file mode 100644
index 000000000000..128ff5db6e67
--- /dev/null
+++ b/arch/ia64/include/asm/paravirt_patch.h
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __ASM_PARAVIRT_PATCH_H
+#define __ASM_PARAVIRT_PATCH_H
+
+#ifdef __ASSEMBLY__
+
+	.section .paravirt_branches, "a"
+	.previous
+#define PARAVIRT_PATCH_SITE_BR(type)		\
+	{					\
+	[1:] ;					\
+	br.cond.sptk.many 2f ;			\
+	nop.b 0 ;				\
+	nop.b 0;; ;				\
+	} ;					\
+	2:					\
+	.xdata8 ".paravirt_branches", 1b, type
+
+#else
+
+#include <linux/stringify.h>
+#include <asm/intrinsics.h>
+
+/* for binary patch */
+struct paravirt_patch_site_bundle {
+	void		*sbundle;
+	void		*ebundle;
+	unsigned long	type;
+};
+
+/* label means the beginning of new bundle */
+#define paravirt_alt_bundle(instr, privop)			\
+	"\t998:\n"						\
+	"\t" instr "\n"						\
+	"\t999:\n"						\
+	"\t.pushsection .paravirt_bundles, \"a\"\n"		\
+	"\t.popsection\n"					\
+	"\t.xdata8 \".paravirt_bundles\", 998b, 999b, "		\
+	__stringify(privop) "\n"
+
+
+struct paravirt_patch_bundle_elem {
+	const void	*sbundle;
+	const void	*ebundle;
+	unsigned long	type;
+};
+
+
+struct paravirt_patch_site_inst {
+	unsigned long stag;
+	unsigned long etag;
+	unsigned long type;
+};
+
+#define paravirt_alt_inst(instr, privop)			\
+	"\t[998:]\n"						\
+	"\t" instr "\n"						\
+	"\t[999:]\n"						\
+	"\t.pushsection .paravirt_insts, \"a\"\n"		\
+	"\t.popsection\n"					\
+	"\t.xdata8 \".paravirt_insts\", 998b, 999b, "		\
+	__stringify(privop) "\n"
+
+struct paravirt_patch_site_branch {
+	unsigned long tag;
+	unsigned long type;
+};
+
+struct paravirt_patch_branch_target {
+	const void	*entry;
+	unsigned long	type;
+};
+
+void
+__paravirt_patch_apply_branch(
+	unsigned long tag, unsigned long type,
+	const struct paravirt_patch_branch_target *entries,
+	unsigned int nr_entries);
+
+void
+paravirt_patch_reloc_br(unsigned long tag, const void *target);
+
+void
+paravirt_patch_reloc_brl(unsigned long tag, const void *target);
+
+
+#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT)
+unsigned long
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
+
+unsigned long
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+			      const struct paravirt_patch_bundle_elem *elems,
+			      unsigned long nelems,
+			      const struct paravirt_patch_bundle_elem **found);
+
+void
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+			    const struct paravirt_patch_site_bundle *end);
+
+void
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+			  const struct paravirt_patch_site_inst *end);
+
+void paravirt_patch_apply(void);
+#else
+#define paravirt_patch_apply_bundle(start, end)	do { } while (0)
+#define paravirt_patch_apply_inst(start, end)	do { } while (0)
+#define paravirt_patch_apply()			do { } while (0)
+#endif
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_PARAVIRT_PATCH_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
index 33c8e55f5775..3d2951130b5f 100644
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ b/arch/ia64/include/asm/paravirt_privop.h
@@ -33,7 +33,7 @@
  */
 
 struct pv_cpu_ops {
-	void (*fc)(unsigned long addr);
+	void (*fc)(void *addr);
 	unsigned long (*thash)(unsigned long addr);
 	unsigned long (*get_cpuid)(int index);
 	unsigned long (*get_pmd)(int index);
@@ -60,12 +60,18 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 /* Instructions paravirtualized for performance */
 /************************************************/
 
+#ifndef ASM_SUPPORTED
+#define paravirt_ssm_i()	pv_cpu_ops.ssm_i()
+#define paravirt_rsm_i()	pv_cpu_ops.rsm_i()
+#define __paravirt_getreg()	pv_cpu_ops.getreg()
+#endif
+
 /* mask for ia64_native_ssm/rsm() must be constant.("i" constraing).
  * static inline function doesn't satisfy it. */
 #define paravirt_ssm(mask)			\
 	do {					\
 		if ((mask) == IA64_PSR_I)	\
-			pv_cpu_ops.ssm_i();	\
+			paravirt_ssm_i();	\
 		else				\
 			ia64_native_ssm(mask);	\
 	} while (0)
@@ -73,7 +79,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 #define paravirt_rsm(mask)			\
 	do {					\
 		if ((mask) == IA64_PSR_I)	\
-			pv_cpu_ops.rsm_i();	\
+			paravirt_rsm_i();	\
 		else				\
 			ia64_native_rsm(mask);	\
 	} while (0)
@@ -86,7 +92,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
 		if ((reg) == _IA64_REG_IP)	\
 			res = ia64_native_getreg(_IA64_REG_IP); \
 		else				\
-			res = pv_cpu_ops.getreg(reg);	\
+			res = __paravirt_getreg(reg);	\
 		res;				\
 	})
 
@@ -112,6 +118,12 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 
 #endif /* CONFIG_PARAVIRT */
 
+#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED)
+#define paravirt_dv_serialize_data()	ia64_dv_serialize_data()
+#else
+#define paravirt_dv_serialize_data()	/* nothing */
+#endif
+
 /* these routines utilize privilege-sensitive or performance-sensitive
  * privileged instructions so the code must be replaced with
  * paravirtualized versions */
@@ -121,4 +133,349 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
 	IA64_PARAVIRT_ASM_FUNC(work_processed_syscall)
 #define ia64_leave_kernel	IA64_PARAVIRT_ASM_FUNC(leave_kernel)
 
+
+#if defined(CONFIG_PARAVIRT)
+/******************************************************************************
+ * binary patching infrastructure
+ */
+#define PARAVIRT_PATCH_TYPE_FC				1
+#define PARAVIRT_PATCH_TYPE_THASH			2
+#define PARAVIRT_PATCH_TYPE_GET_CPUID			3
+#define PARAVIRT_PATCH_TYPE_GET_PMD			4
+#define PARAVIRT_PATCH_TYPE_PTCGA			5
+#define PARAVIRT_PATCH_TYPE_GET_RR			6
+#define PARAVIRT_PATCH_TYPE_SET_RR			7
+#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4		8
+#define PARAVIRT_PATCH_TYPE_SSM_I			9
+#define PARAVIRT_PATCH_TYPE_RSM_I			10
+#define PARAVIRT_PATCH_TYPE_GET_PSR_I			11
+#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE	12
+
+/* PARAVIRT_PATCH_TYPE_[GS]ETREG + _IA64_REG_xxx */
+#define PARAVIRT_PATCH_TYPE_GETREG			0x10000000
+#define PARAVIRT_PATCH_TYPE_SETREG			0x20000000
+
+/*
+ * struct task_struct* (*ia64_switch_to)(void* next_task);
+ * void *ia64_leave_syscall;
+ * void *ia64_work_processed_syscall
+ * void *ia64_leave_kernel;
+ */
+
+#define PARAVIRT_PATCH_TYPE_BR_START			0x30000000
+#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 0)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 1)
+#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL	\
+	(PARAVIRT_PATCH_TYPE_BR_START + 2)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL		\
+	(PARAVIRT_PATCH_TYPE_BR_START + 3)
+
+#ifdef ASM_SUPPORTED
+#include <asm/paravirt_patch.h>
+
+/*
+ * pv_cpu_ops calling stub.
+ * normal function call convention can't be written by gcc
+ * inline assembly.
+ *
+ * from the caller's point of view,
+ * the following registers will be clobbered.
+ * r2, r3
+ * r8-r15
+ * r16, r17
+ * b6, b7
+ * p6-p15
+ * ar.ccv
+ *
+ * from the callee's point of view,
+ * the following registers can be used.
+ * r2, r3: scratch
+ * r8: scratch, input argument0 and return value
+ * r0-r15: scratch, input argument1-5
+ * b6: return pointer
+ * b7: scratch
+ * p6-p15: scratch
+ * ar.ccv: scratch
+ *
+ * other registers must not be changed. especially
+ * b0: rp: preserved. gcc ignores b0 in clobbered register.
+ * r16: saved gp
+ */
+/* 5 bundles */
+#define __PARAVIRT_BR							\
+	";;\n"								\
+	"{ .mlx\n"							\
+	"nop 0\n"							\
+	"movl r2 = %[op_addr]\n"/* get function pointer address */	\
+	";;\n"								\
+	"}\n"								\
+	"1:\n"								\
+	"{ .mii\n"							\
+	"ld8 r2 = [r2]\n"	/* load function descriptor address */	\
+	"mov r17 = ip\n"	/* get ip to calc return address */	\
+	"mov r16 = gp\n"	/* save gp */				\
+	";;\n"								\
+	"}\n"								\
+	"{ .mii\n"							\
+	"ld8 r3 = [r2], 8\n"	/* load entry address */		\
+	"adds r17 = 1f - 1b, r17\n"	/* calculate return address */	\
+	";;\n"								\
+	"mov b7 = r3\n"		/* set entry address */			\
+	"}\n"								\
+	"{ .mib\n"							\
+	"ld8 gp = [r2]\n"	/* load gp value */			\
+	"mov b6 = r17\n"	/* set return address */		\
+	"br.cond.sptk.few b7\n"	/* intrinsics are very short isns */	\
+	"}\n"								\
+	"1:\n"								\
+	"{ .mii\n"							\
+	"mov gp = r16\n"	/* restore gp value */			\
+	"nop 0\n"							\
+	"nop 0\n"							\
+	";;\n"								\
+	"}\n"
+
+#define PARAVIRT_OP(op)				\
+	[op_addr] "i"(&pv_cpu_ops.op)
+
+#define PARAVIRT_TYPE(type)			\
+	PARAVIRT_PATCH_TYPE_ ## type
+
+#define PARAVIRT_REG_CLOBBERS0				\
+	"r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
+	"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS1				\
+	"r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
+	"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS2				\
+	"r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14",	\
+	"r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS5				\
+	"r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/	\
+	"r15", "r16", "r17"
+
+#define PARAVIRT_BR_CLOBBERS			\
+	"b6", "b7"
+
+#define PARAVIRT_PR_CLOBBERS						\
+	"p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"
+
+#define PARAVIRT_AR_CLOBBERS			\
+	"ar.ccv"
+
+#define PARAVIRT_CLOBBERS0			\
+	PARAVIRT_REG_CLOBBERS0,			\
+	PARAVIRT_BR_CLOBBERS,			\
+	PARAVIRT_PR_CLOBBERS,			\
+	PARAVIRT_AR_CLOBBERS,			\
+	"memory"
+
+#define PARAVIRT_CLOBBERS1			\
+	PARAVIRT_REG_CLOBBERS1,			\
+	PARAVIRT_BR_CLOBBERS,			\
+	PARAVIRT_PR_CLOBBERS,			\
+	PARAVIRT_AR_CLOBBERS,			\
+	"memory"
+
+#define PARAVIRT_CLOBBERS2			\
+	PARAVIRT_REG_CLOBBERS2,			\
+	PARAVIRT_BR_CLOBBERS,			\
+	PARAVIRT_PR_CLOBBERS,			\
+	PARAVIRT_AR_CLOBBERS,			\
+	"memory"
+
+#define PARAVIRT_CLOBBERS5			\
+	PARAVIRT_REG_CLOBBERS5,			\
+	PARAVIRT_BR_CLOBBERS,			\
+	PARAVIRT_PR_CLOBBERS,			\
+	PARAVIRT_AR_CLOBBERS,			\
+	"memory"
+
+#define PARAVIRT_BR0(op, type)					\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op)				\
+		      : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR0_RET(op, type)				\
+	register unsigned long ia64_intri_res asm ("r8");	\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_intri_res)			\
+		      : PARAVIRT_OP(op)				\
+		      : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR1(op, type, arg1)				\
+	register unsigned long __##arg1 asm ("r8") = arg1;	\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR1_RET(op, type, arg1)			\
+	register unsigned long ia64_intri_res asm ("r8");	\
+	register unsigned long __##arg1 asm ("r8") = arg1;	\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_intri_res)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR1_VOID(op, type, arg1)			\
+	register void *__##arg1 asm ("r8") = arg1;		\
+	register unsigned long ia64_clobber asm ("r8");		\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_clobber)			\
+		      : PARAVIRT_OP(op), "0"(__##arg1)		\
+		      : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR2(op, type, arg1, arg2)			\
+	register unsigned long __##arg1 asm ("r8") = arg1;	\
+	register unsigned long __##arg2 asm ("r9") = arg2;	\
+	register unsigned long ia64_clobber1 asm ("r8");	\
+	register unsigned long ia64_clobber2 asm ("r9");	\
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+					  PARAVIRT_TYPE(type))	\
+		      : "=r"(ia64_clobber1), "=r"(ia64_clobber2)	\
+		      : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2)	\
+		      : PARAVIRT_CLOBBERS2)
+
+
+#define PARAVIRT_DEFINE_CPU_OP0(op, type)		\
+	static inline void				\
+	paravirt_ ## op (void)				\
+	{						\
+		PARAVIRT_BR0(op, type);			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type)		\
+	static inline unsigned long			\
+	paravirt_ ## op (void)				\
+	{						\
+		PARAVIRT_BR0_RET(op, type);		\
+		return ia64_intri_res;			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type)		\
+	static inline void				\
+	paravirt_ ## op (void *arg1)			\
+	{						\
+		PARAVIRT_BR1_VOID(op, type, arg1);	\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP1(op, type)		\
+	static inline void				\
+	paravirt_ ## op (unsigned long arg1)		\
+	{						\
+		PARAVIRT_BR1(op, type, arg1);		\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type)		\
+	static inline unsigned long			\
+	paravirt_ ## op (unsigned long arg1)		\
+	{						\
+		PARAVIRT_BR1_RET(op, type, arg1);	\
+		return ia64_intri_res;			\
+	}
+
+#define PARAVIRT_DEFINE_CPU_OP2(op, type)		\
+	static inline void				\
+	paravirt_ ## op (unsigned long arg1,		\
+			 unsigned long arg2)		\
+	{						\
+		PARAVIRT_BR2(op, type, arg1, arg2);	\
+	}
+
+
+PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC);
+PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD)
+PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR)
+PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR)
+PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I)
+PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I)
+PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I)
+PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE)
+
+static inline void
+paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
+			unsigned long val2, unsigned long val3,
+			unsigned long val4)
+{
+	register unsigned long __val0 asm ("r8") = val0;
+	register unsigned long __val1 asm ("r9") = val1;
+	register unsigned long __val2 asm ("r10") = val2;
+	register unsigned long __val3 asm ("r11") = val3;
+	register unsigned long __val4 asm ("r14") = val4;
+
+	register unsigned long ia64_clobber0 asm ("r8");
+	register unsigned long ia64_clobber1 asm ("r9");
+	register unsigned long ia64_clobber2 asm ("r10");
+	register unsigned long ia64_clobber3 asm ("r11");
+	register unsigned long ia64_clobber4 asm ("r14");
+
+	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,
+					  PARAVIRT_TYPE(SET_RR0_TO_RR4))
+		      : "=r"(ia64_clobber0),
+			"=r"(ia64_clobber1),
+			"=r"(ia64_clobber2),
+			"=r"(ia64_clobber3),
+			"=r"(ia64_clobber4)
+		      : PARAVIRT_OP(set_rr0_to_rr4),
+			"0"(__val0), "1"(__val1), "2"(__val2),
+			"3"(__val3), "4"(__val4)
+		      : PARAVIRT_CLOBBERS5);
+}
+
+/* unsigned long paravirt_getreg(int reg) */
+#define __paravirt_getreg(reg)						\
+	({								\
+		register unsigned long ia64_intri_res asm ("r8");	\
+		register unsigned long __reg asm ("r8") = (reg);	\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(reg));		\
+		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+						  PARAVIRT_TYPE(GETREG)	\
+						  + (reg))		\
+			      : "=r"(ia64_intri_res)			\
+			      : PARAVIRT_OP(getreg), "0"(__reg)		\
+			      : PARAVIRT_CLOBBERS1);			\
+									\
+		ia64_intri_res;						\
+	})
+
+/* void paravirt_setreg(int reg, unsigned long val) */
+#define paravirt_setreg(reg, val)					\
+	do {								\
+		register unsigned long __val asm ("r8") = val;		\
+		register unsigned long __reg asm ("r9") = reg;		\
+		register unsigned long ia64_clobber1 asm ("r8");	\
+		register unsigned long ia64_clobber2 asm ("r9");	\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(reg));		\
+		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
+						  PARAVIRT_TYPE(SETREG)	\
+						  + (reg))		\
+			      : "=r"(ia64_clobber1),			\
+				"=r"(ia64_clobber2)			\
+			      : PARAVIRT_OP(setreg),			\
+				"1"(__reg), "0"(__val)			\
+			      : PARAVIRT_CLOBBERS2);			\
+	} while (0)
+
+#endif /* ASM_SUPPORTED */
+#endif /* CONFIG_PARAVIRT && ASM_SUPPORTED */
+
 #endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */
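[To make the stub machinery above concrete — the expansion is mine, not part of the diff: PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH) expands to roughly the following, with r8 doubling as argument and return register per the calling convention documented in the header.]

	/* Approximate expansion of PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH). */
	static inline unsigned long
	paravirt_thash(unsigned long arg1)
	{
		register unsigned long ia64_intri_res asm ("r8");
		register unsigned long __arg1 asm ("r8") = arg1;
		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,
						  PARAVIRT_PATCH_TYPE_THASH)
			      : "=r"(ia64_intri_res)
			      : PARAVIRT_OP(thash), "0"(__arg1)
			      : PARAVIRT_CLOBBERS1);
		return ia64_intri_res;
	}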
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
index 21c402365d0e..598408336251 100644
--- a/arch/ia64/include/asm/smp.h
+++ b/arch/ia64/include/asm/smp.h
@@ -126,7 +126,8 @@ extern void identify_siblings (struct cpuinfo_ia64 *);
 extern int is_multithreading_enabled(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #else /* CONFIG_SMP */
 
diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h
index 4e03cfe74a0c..86c7db861180 100644
--- a/arch/ia64/include/asm/timex.h
+++ b/arch/ia64/include/asm/timex.h
@@ -40,5 +40,6 @@ get_cycles (void)
 }
 
 extern void ia64_cpu_local_tick (void);
+extern unsigned long long ia64_native_sched_clock (void);
 
 #endif /* _ASM_IA64_TIMEX_H */
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index f260dcf21515..7b4c8c70b2d1 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -112,11 +112,6 @@ void build_cpu_to_node_map(void);
 
 extern void arch_fix_phys_package_id(int num, u32 slot);
 
-#define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ?	\
-					CPU_MASK_ALL :		\
-					node_to_cpumask(pcibus_to_node(bus)) \
-				)
-
 #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?	\
 					cpu_all_mask :		\
 					cpumask_of_node(pcibus_to_node(bus)))
diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h
index 7a804e80fc67..e425227a418e 100644
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -33,9 +33,6 @@
 #ifndef _ASM_IA64_XEN_HYPERVISOR_H
 #define _ASM_IA64_XEN_HYPERVISOR_H
 
-#ifdef CONFIG_XEN
-
-#include <linux/init.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>	/* to compile feature.c */
 #include <xen/features.h>		/* to comiple xen-netfront.c */
@@ -43,22 +40,32 @@
 
 /* xen_domain_type is set before executing any C code by early_xen_setup */
 enum xen_domain_type {
-	XEN_NATIVE,
-	XEN_PV_DOMAIN,
-	XEN_HVM_DOMAIN,
+	XEN_NATIVE,	/* running on bare hardware */
+	XEN_PV_DOMAIN,	/* running in a PV domain */
+	XEN_HVM_DOMAIN,	/* running in a Xen hvm domain*/
 };
 
+#ifdef CONFIG_XEN
 extern enum xen_domain_type xen_domain_type;
+#else
+#define xen_domain_type		XEN_NATIVE
+#endif
 
 #define xen_domain()		(xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain()		(xen_domain_type == XEN_PV_DOMAIN)
-#define xen_initial_domain()	(xen_pv_domain() && \
+#define xen_pv_domain()		(xen_domain() &&			\
+				 xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain()	(xen_domain() &&			\
+				 xen_domain_type == XEN_HVM_DOMAIN)
+
+#ifdef CONFIG_XEN_DOM0
+#define xen_initial_domain()	(xen_pv_domain() && \
 				 (xen_start_info->flags & SIF_INITDOMAIN))
-#define xen_hvm_domain()	(xen_domain_type == XEN_HVM_DOMAIN)
+#else
+#define xen_initial_domain()	(0)
+#endif
 
-/* deprecated. remove this */
-#define is_running_on_xen()	(xen_domain_type == XEN_PV_DOMAIN)
-
+#ifdef CONFIG_XEN
 extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
@@ -74,16 +81,6 @@ void force_evtchn_callback(void);
 
 /* For setup_arch() in arch/ia64/kernel/setup.c */
 void xen_ia64_enable_opt_feature(void);
-
-#else /* CONFIG_XEN */
-
-#define xen_domain()			(0)
-#define xen_pv_domain()			(0)
-#define xen_initial_domain()		(0)
-#define xen_hvm_domain()		(0)
-#define is_running_on_xen()		(0) /* deprecated. remove this */
 #endif
 
-#define is_initial_xendomain() (0) /* deprecated. remove this */
-
 #endif /* _ASM_IA64_XEN_HYPERVISOR_H */
diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h
index 19c2ae1d878a..c53a47611208 100644
--- a/arch/ia64/include/asm/xen/inst.h
+++ b/arch/ia64/include/asm/xen/inst.h
@@ -33,6 +33,9 @@
 #define __paravirt_work_processed_syscall_target \
 	xen_work_processed_syscall
 
+#define paravirt_fsyscall_table			xen_fsyscall_table
+#define paravirt_fsys_bubble_down		xen_fsys_bubble_down
+
 #define MOV_FROM_IFA(reg)	\
 	movl reg = XSI_IFA;	\
 	;;			\
@@ -110,6 +113,27 @@
 .endm
 #define MOV_FROM_PSR(pred, reg, clob)	__MOV_FROM_PSR pred, reg, clob
 
+/* assuming ar.itc is read with interrupt disabled. */
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
+(pred)	movl clob = XSI_ITC_OFFSET;			\
+	;;						\
+(pred)	ld8 clob = [clob];				\
+(pred)	mov reg = ar.itc;				\
+	;;						\
+(pred)	add reg = reg, clob;				\
+	;;						\
+(pred)	movl clob = XSI_ITC_LAST;			\
+	;;						\
+(pred)	ld8 clob = [clob];				\
+	;;						\
+(pred)	cmp.geu.unc pred_clob, p0 = clob, reg;		\
+	;;						\
+(pred_clob)	add reg = 1, clob;			\
+	;;						\
+(pred)	movl clob = XSI_ITC_LAST;			\
+	;;						\
+(pred)	st8 [clob] = reg
+
 
 #define MOV_TO_IFA(reg, clob)	\
 	movl clob = XSI_IFA;	\
@@ -362,6 +386,10 @@
 #define RSM_PSR_DT		\
 	XEN_HYPER_RSM_PSR_DT
 
+#define RSM_PSR_BE_I(clob0, clob1)	\
+	RSM_PSR_I(p0, clob0, clob1);	\
+	rum psr.be
+
 #define SSM_PSR_DT_AND_SRLZ_I	\
 	XEN_HYPER_SSM_PSR_DT
 
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h
index f00fab40854d..e951e740bdf2 100644
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -209,6 +209,15 @@ struct mapped_regs {
 			unsigned long krs[8];	/* kernel registers */
 			unsigned long tmp[16];	/* temp registers
 						   (e.g. for hyperprivops) */
+
+			/* itc paravirtualization
+			 * vAR.ITC = mAR.ITC + itc_offset
+			 * itc_last is one which was lastly passed to
+			 * the guest OS in order to prevent it from
+			 * going backwards.
+			 */
+			unsigned long itc_offset;
+			unsigned long itc_last;
 		};
 	};
 };
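[In C terms — this model is mine, not part of the diff: the MOV_FROM_ITC asm sequence in xen/inst.h above and these two fields implement the following logic: offset the hardware counter into guest time, then clamp so consecutive reads never go backwards.]

	/* Illustrative model of the virtual ITC read done by MOV_FROM_ITC;
	 * mapped_regs is the shared structure defined above. */
	static unsigned long xen_read_itc(struct mapped_regs *r)
	{
		unsigned long now;

		now = ia64_native_getreg(_IA64_REG_AR_ITC) + r->itc_offset;
		if (now <= r->itc_last)		/* never let vAR.ITC go backwards */
			now = r->itc_last + 1;
		r->itc_last = now;
		return now;
	}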
diff --git a/arch/ia64/include/asm/xen/minstate.h b/arch/ia64/include/asm/xen/minstate.h
index 4d92d9bbda7b..c57fa910f2c9 100644
--- a/arch/ia64/include/asm/xen/minstate.h
+++ b/arch/ia64/include/asm/xen/minstate.h
@@ -1,3 +1,12 @@
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/* read ar.itc in advance, and use it before leaving bank 0 */
+#define XEN_ACCOUNT_GET_STAMP		\
+	MOV_FROM_ITC(pUStk, p6, r20, r2);
+#else
+#define XEN_ACCOUNT_GET_STAMP
+#endif
+
 /*
  * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
  * the minimum state necessary that allows us to turn psr.ic back
@@ -123,7 +132,7 @@
 	;;							\
 .mem.offset 0,0; st8.spill [r16]=r2,16;				\
 .mem.offset 8,0; st8.spill [r17]=r3,16;				\
-	ACCOUNT_GET_STAMP					\
+	XEN_ACCOUNT_GET_STAMP					\
 	adds r2=IA64_PT_REGS_R16_OFFSET,r1;			\
 	;;							\
 	EXTRA;							\
diff --git a/arch/ia64/include/asm/xen/patchlist.h b/arch/ia64/include/asm/xen/patchlist.h
new file mode 100644
index 000000000000..eae944e88846
--- /dev/null
+++ b/arch/ia64/include/asm/xen/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/xen/patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist		\
+	__xen_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist			\
+	__xen_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
+	__xen_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
+	__xen_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist			\
+	__xen_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist			\
+	__xen_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist		\
+	__xen_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist		\
+	__xen_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h
index 71ec7546e100..fb4ec5e0b066 100644
--- a/arch/ia64/include/asm/xen/privop.h
+++ b/arch/ia64/include/asm/xen/privop.h
@@ -55,6 +55,8 @@
 #define XSI_BANK1_R16			(XSI_BASE + XSI_BANK1_R16_OFS)
 #define XSI_BANKNUM			(XSI_BASE + XSI_BANKNUM_OFS)
 #define XSI_IHA				(XSI_BASE + XSI_IHA_OFS)
+#define XSI_ITC_OFFSET			(XSI_BASE + XSI_ITC_OFFSET_OFS)
+#define XSI_ITC_LAST			(XSI_BASE + XSI_ITC_LAST_OFS)
 #endif
 
 #ifndef __ASSEMBLY__
@@ -67,7 +69,7 @@
  * may have different semantics depending on whether they are executed
  * at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
  * be allowed to execute directly, lest incorrect semantics result. */
-extern void xen_fc(unsigned long addr);
+extern void xen_fc(void *addr);
 extern unsigned long xen_thash(unsigned long addr);
 
 /* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
@@ -80,8 +82,10 @@ extern unsigned long xen_thash(unsigned long addr);
 extern unsigned long xen_get_cpuid(int index);
 extern unsigned long xen_get_pmd(int index);
 
+#ifndef ASM_SUPPORTED
 extern unsigned long xen_get_eflag(void);	/* see xen_ia64_getreg */
 extern void xen_set_eflag(unsigned long);	/* see xen_ia64_setreg */
+#endif
 
 /************************************************/
 /* Instructions paravirtualized for performance */
@@ -106,6 +110,7 @@ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */
 #define xen_get_virtual_pend()		\
	(*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
 
+#ifndef ASM_SUPPORTED
 /* Although all privileged operations can be left to trap and will
  * be properly handled by Xen, some are frequent enough that we use
  * hyperprivops for performance. */
@@ -123,6 +128,7 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
 			       unsigned long val4);
 extern void xen_set_kr(unsigned long index, unsigned long val);
 extern void xen_ptcga(unsigned long addr, unsigned long size);
+#endif /* !ASM_SUPPORTED */
 
 #endif /* !__ASSEMBLY__ */
 
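The new ASM_SUPPORTED guards express the two implementation strategies: without assembler support the privileged operations stay C functions reached through hook tables, while with it they become patchable stubs. A compile-time model of that split, in plain C with stand-in names (not the kernel's actual types):

	/* Model of the ASM_SUPPORTED split; illustrative only. */
	#include <stdio.h>

	static unsigned long native_get_psr(void) { return 0x2000; } /* stand-in */

	#ifndef ASM_SUPPORTED
	/* no asm support: go through a C function-pointer table */
	struct pv_cpu_ops { unsigned long (*getreg)(void); };
	static struct pv_cpu_ops pv_cpu_ops = { .getreg = native_get_psr };
	#define GETREG() (*pv_cpu_ops.getreg)()
	#else
	/* asm supported: the call site would be a patchable stub; model as direct call */
	#define GETREG() native_get_psr()
	#endif

	int main(void)
	{
		printf("psr=%#lx\n", GETREG());
		return 0;
	}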
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index f2778f2c4fd9..5628e9a990a6 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -5,7 +5,7 @@
 extra-y	:= head.o init_task.o vmlinux.lds
 
 obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
-	 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
+	 irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \
	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
	 unwind.o mca.o mca_asm.o topology.o dma-mapping.o
 
@@ -36,7 +36,8 @@ obj-$(CONFIG_PCI_MSI) += msi_ia64.o
 mca_recovery-y			+= mca_drv.o mca_drv_asm.o
 obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
 
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o \
+				   paravirt_patch.o
 
 obj-$(CONFIG_IA64_ESI)		+= esi.o
 ifneq ($(CONFIG_IA64_ESI),)
@@ -45,35 +46,13 @@ endif
 obj-$(CONFIG_DMAR)		+= pci-dma.o
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 
-# The gate DSO image is built using a special linker script.
-targets += gate.so gate-syms.o
-
-extra-y += gate.so gate-syms.o gate.lds gate.o
-
 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
 CFLAGS_traps.o	+= -mfixed-range=f2-f5,f16-f31
 
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
-
-quiet_cmd_gate = GATE    $@
-      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
-		     $(call ld-option, -Wl$(comma)--hash-style=sysv)
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-$(obj)/built-in.o: $(obj)/gate-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
-
-GATECFLAGS_gate-syms.o = -r
-$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-# gate-data.o contains the gate DSO image as data in section .data.gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+# tell it is compiled for native
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE
 
 # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
 define sed-y
@@ -109,9 +88,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s
 clean-files += $(objtree)/include/asm-ia64/nr-irqs.h
 
 #
-# native ivt.S and entry.S
+# native ivt.S, entry.S and fsys.S
 #
-ASM_PARAVIRT_OBJS = ivt.o entry.o
+ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o
 define paravirtualized_native
 AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE
 AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
new file mode 100644
index 000000000000..1d87f84069b3
--- /dev/null
+++ b/arch/ia64/kernel/Makefile.gate
@@ -0,0 +1,27 @@
+# The gate DSO image is built using a special linker script.
+
+targets += gate.so gate-syms.o
+
+extra-y += gate.so gate-syms.o gate.lds gate.o
+
+CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+
+quiet_cmd_gate = GATE    $@
+      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
+		     $(call ld-option, -Wl$(comma)--hash-style=sysv)
+$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+$(obj)/built-in.o: $(obj)/gate-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+
+GATECFLAGS_gate-syms.o = -r
+$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+# gate-data.o contains the gate DSO image as data in section .data.gate.
+# We must build gate.so before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index bdef2ce38c8b..5510317db37b 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -890,7 +890,7 @@ __init void prefill_possible_map(void)
 		possible, max((possible - available_cpus), 0));
 
 	for (i = 0; i < possible; i++)
-		cpu_set(i, cpu_possible_map);
+		set_cpu_possible(i, true);
 }
 
 int acpi_map_lsapic(acpi_handle handle, int *pcpu)
@@ -928,9 +928,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	buffer.length = ACPI_ALLOCATE_BUFFER;
 	buffer.pointer = NULL;
 
-	cpus_complement(tmp_map, cpu_present_map);
-	cpu = first_cpu(tmp_map);
-	if (cpu >= NR_CPUS)
+	cpumask_complement(&tmp_map, cpu_present_mask);
+	cpu = cpumask_first(&tmp_map);
+	if (cpu >= nr_cpu_ids)
		return -EINVAL;
 
	acpi_map_cpu2node(handle, cpu, physid);
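These acpi.c hunks are part of the cpumask API conversion: value-based helpers (cpus_complement(), first_cpu()) bounded by NR_CPUS give way to pointer-based cpumask_* helpers bounded by nr_cpu_ids. A userspace model of the complement-and-find-first step, with the mask and limits mocked up rather than taken from the kernel:

	/* Userspace model of the cpumask change: a bounded bitmap search. */
	#include <stdio.h>

	static unsigned long nr_cpu_ids = 8;	/* CPUs that can ever exist */

	static int mask_first_zero(unsigned long mask)
	{
		int cpu;
		/* complement-then-first-set == first clear bit */
		for (cpu = 0; cpu < (int)nr_cpu_ids; cpu++)
			if (!(mask & (1UL << cpu)))
				return cpu;
		return (int)nr_cpu_ids;	/* "none": compare against nr_cpu_ids */
	}

	int main(void)
	{
		unsigned long present = 0x07;	/* cpus 0-2 present */
		int cpu = mask_first_zero(present);

		if (cpu >= (int)nr_cpu_ids)
			return 1;		/* -EINVAL in the kernel code */
		printf("first free cpu: %d\n", cpu);
		return 0;
	}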
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 742dbb1d5a4f..af5650169043 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -316,5 +316,7 @@ void foo(void)
 	DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
	DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat);
	DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat);
+	DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset);
+	DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last);
 #endif /* CONFIG_XEN */
 }
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index efaff15d8cf1..7ef80e8161ce 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -456,6 +456,7 @@ efi_map_pal_code (void)
 		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
		 IA64_GRANULE_SHIFT);
+	paravirt_dv_serialize_data();
	ia64_set_psr(psr);		/* restore psr */
 }
 
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e5341e2c1175..ccfdeee9d89f 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall)
 __paravirt_work_processed_syscall:
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
	adds r2=PT(LOADRS)+16,r12
-(pUStk)	mov.m r22=ar.itc			// fetch time at leave
+	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
	;;
 (p6)	ld4 r31=[r18]				// load current_thread_info()->flags
@@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
	.pred.rel.mutex pUStk,pKStk
	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-(pUStk)	mov.m r22=ar.itc	// M  fetch time at leave
+	MOV_FROM_ITC(pUStk, p9, r22, r29)	// M  fetch time at leave
	nop.i 0
	;;
 #else
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index c1625c7e1779..3567d54f8cee 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -25,6 +25,7 @@
 #include <asm/unistd.h>
 
 #include "entry.h"
+#include "paravirt_inst.h"
 
 /*
  * See Documentation/ia64/fsys.txt for details on fsyscalls.
@@ -279,7 +280,7 @@ ENTRY(fsys_gettimeofday)
 (p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
	;;
	.pred.rel.mutex p8,p9
-(p8)	mov r2 = ar.itc		// CPU_TIMER. 36 clocks latency!!!
+	MOV_FROM_ITC(p8, p6, r2, r10)	// CPU_TIMER. 36 clocks latency!!!
 (p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
 (p13)	ld8 r25 = [r19]		// get itc_lastcycle value
	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_sec
@@ -418,7 +419,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set
 	mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1))
	;;
 
-	rsm psr.i				// mask interrupt delivery
+	RSM_PSR_I(p0, r18, r19)			// mask interrupt delivery
	mov ar.ccv=0
	andcm r14=r14,r17			// filter out SIGKILL & SIGSTOP
 
@@ -491,7 +492,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set
 #ifdef CONFIG_SMP
	st4.rel [r31]=r0			// release the lock
 #endif
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r31)
	;;
 
	srlz.d					// ensure psr.i is set again
@@ -513,7 +514,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
 #ifdef CONFIG_SMP
	st4.rel [r31]=r0			// release the lock
 #endif
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r17)
	;;
	srlz.d
	br.sptk.many fsys_fallback_syscall	// with signal pending, do the heavy-weight syscall
@@ -521,7 +522,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
 #ifdef CONFIG_SMP
 .lock_contention:
	/* Rather than spinning here, fall back on doing a heavy-weight syscall.  */
-	ssm psr.i
+	SSM_PSR_I(p0, p9, r17)
	;;
	srlz.d
	br.sptk.many fsys_fallback_syscall
@@ -592,17 +593,17 @@ ENTRY(fsys_fallback_syscall)
 	adds r17=-1024,r15
	movl r14=sys_call_table
	;;
-	rsm psr.i
+	RSM_PSR_I(p0, r26, r27)
	shladd r18=r17,3,r14
	;;
	ld8 r18=[r18]				// load normal (heavy-weight) syscall entry-point
-	mov r29=psr				// read psr (12 cyc load latency)
+	MOV_FROM_PSR(p0, r29, r26)		// read psr (12 cyc load latency)
	mov r27=ar.rsc
	mov r21=ar.fpsr
	mov r26=ar.pfs
 END(fsys_fallback_syscall)
	/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
+GLOBAL_ENTRY(paravirt_fsys_bubble_down)
	.prologue
	.altrp b6
	.body
@@ -640,7 +641,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 *
	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time fsys_bubble_down gets
+	 * PSR.I  : already turned off by the time paravirt_fsys_bubble_down gets
	 *	    invoked
	 * PSR.DFL: always 0 (kernel never turns it on)
	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
@@ -650,7 +651,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	 * PSR.DB : don't care --- kernel never enables kernel-level
	 *	    breakpoints
	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *	    __kernel_syscall_via_epc, the branch to fsys_bubble_down
+	 *	    __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down
	 *	    will trigger a taken branch; the taken-trap-handler then
	 *	    converts the syscall into a break-based system-call.
	 */
@@ -683,7 +684,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	;;
	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	mov.m r30=ar.itc			// M    get cycle for accounting
+	MOV_FROM_ITC(p0, p6, r30, r23)		// M    get cycle for accounting
 #else
	nop.m 0
 #endif
@@ -734,21 +735,21 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	mov rp=r14				// I0   set the real return addr
	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
	;;
-	ssm psr.i				// M2   we're on kernel stacks now, reenable irqs
+	SSM_PSR_I(p0, p6, r22)			// M2   we're on kernel stacks now, reenable irqs
	cmp.eq p8,p0=r3,r0			// A
 (p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
 
	nop.m 0
 (p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
	br.cond.spnt ia64_trace_syscall		// B
-END(fsys_bubble_down)
+END(paravirt_fsys_bubble_down)
 
	.rodata
	.align 8
-	.globl fsyscall_table
+	.globl paravirt_fsyscall_table
 
-	data8 fsys_bubble_down
-fsyscall_table:
+	data8 paravirt_fsys_bubble_down
+paravirt_fsyscall_table:
	data8 fsys_ni_syscall
	data8 0				// exit			// 1025
	data8 0				// read
@@ -1033,4 +1034,4 @@ fsyscall_table:
 
	// fill in zeros for the remaining entries
	.zero:
-	.space fsyscall_table + 8*NR_syscalls - .zero, 0
+	.space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0
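Throughout fsys.S, raw privilege-sensitive instructions (mov r2=ar.itc, ssm psr.i, rsm psr.i) are replaced by macros from paravirt_inst.h so the same source can assemble to a native or a paravirtualized flavor; the extra operands name predicate and scratch registers that the non-native expansions may clobber. The flavor is chosen per object file with defines such as __IA64_ASM_PARAVIRTUALIZED_NATIVE. A plain-C model of that one-source/two-flavors scheme (the names here are illustrative, not the real macros):

	/* Build with or without -DPARAVIRTUALIZED to switch flavors. */
	#include <stdio.h>

	#ifdef PARAVIRTUALIZED
	static unsigned long hypervisor_read_cycles(void) { return 100; } /* hypothetical hook */
	#define READ_CYCLES() hypervisor_read_cycles()
	#else
	static unsigned long native_read_cycles(void) { return 42; }
	#define READ_CYCLES() native_read_cycles()
	#endif

	int main(void)
	{
		printf("cycles: %lu\n", READ_CYCLES());
		return 0;
	}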
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index 74b1ccce4e84..cf5e0a105e16 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -13,6 +13,7 @@
 #include <asm/sigcontext.h>
 #include <asm/system.h>
 #include <asm/unistd.h>
+#include "paravirt_inst.h"
 
 /*
  * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation,
@@ -48,87 +49,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break)
 }
 END(__kernel_syscall_via_break)
 
-/*
- * On entry:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	b6  = return address
- * On exit:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	all other "scratch" registers:	undefined
- *	all "preserved" registers:	same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
-	.prologue
-	.altrp b6
-	.body
-{
-	/*
-	 * Note: the kernel cannot assume that the first two instructions in this
-	 * bundle get executed.  The remaining code must be safe even if
-	 * they do not get executed.
-	 */
-	adds r17=-1024,r15			// A
-	mov r10=0				// A    default to successful syscall execution
-	epc					// B	causes split-issue
-}
-	;;
-	rsm psr.be | psr.i			// M2 (5 cyc to srlz.d)
-	LOAD_FSYSCALL_TABLE(r14)		// X
-	;;
-	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
-	shladd r18=r17,3,r14			// A
-	mov r19=NR_syscalls-1			// A
-	;;
-	lfetch [r18]				// M0|1
-	mov r29=psr				// M2 (12 cyc)
-	// If r17 is a NaT, p6 will be zero
-	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
-	;;
-	mov r21=ar.fpsr				// M2 (12 cyc)
-	tnat.nz p10,p9=r15			// I0
-	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
-	;;
-	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
-(p6)	ld8 r18=[r18]				// M0|1
-	nop.i 0
-	;;
-	nop.m 0
-(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
-	nop.i 0
-	;;
-(p8)	ssm psr.i
-(p6)	mov b7=r18				// I0
-(p8)	br.dptk.many b7				// B
-
-	mov r27=ar.rsc				// M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
- * future version of the linker.  In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
-	;;
-(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
-	;;
-(p6)	mov b7=r14
-(p6)	br.sptk.many b7
-#else
-	BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
-	ssm psr.i
-	mov r10=-1
-(p10)	mov r8=EINVAL
-(p9)	mov r8=ENOSYS
-	FSYS_RETURN
-END(__kernel_syscall_via_epc)
-
 # define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
 # define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
 # define ARG2_OFF		(16 + IA64_SIGFRAME_ARG2_OFFSET)
@@ -374,3 +294,92 @@ restore_rbs:
 	// invala not necessary as that will happen when returning to user-mode
	br.cond.sptk back_from_restore_rbs
 END(__kernel_sigtramp)
+
+/*
+ * On entry:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	b6  = return address
+ * On exit:
+ *	r11 = saved ar.pfs
+ *	r15 = system call #
+ *	b0  = saved return address
+ *	all other "scratch" registers:	undefined
+ *	all "preserved" registers:	same as on entry
+ */
+
+GLOBAL_ENTRY(__kernel_syscall_via_epc)
+	.prologue
+	.altrp b6
+	.body
+{
+	/*
+	 * Note: the kernel cannot assume that the first two instructions in this
+	 * bundle get executed.  The remaining code must be safe even if
+	 * they do not get executed.
+	 */
+	adds r17=-1024,r15			// A
+	mov r10=0				// A    default to successful syscall execution
+	epc					// B	causes split-issue
+}
+	;;
+	RSM_PSR_BE_I(r20, r22)			// M2 (5 cyc to srlz.d)
+	LOAD_FSYSCALL_TABLE(r14)		// X
+	;;
+	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
+	shladd r18=r17,3,r14			// A
+	mov r19=NR_syscalls-1			// A
+	;;
+	lfetch [r18]				// M0|1
+	MOV_FROM_PSR(p0, r29, r8)		// M2 (12 cyc)
+	// If r17 is a NaT, p6 will be zero
+	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
+	;;
+	mov r21=ar.fpsr				// M2 (12 cyc)
+	tnat.nz p10,p9=r15			// I0
+	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
+	;;
+	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
+(p6)	ld8 r18=[r18]				// M0|1
+	nop.i 0
+	;;
+	nop.m 0
+(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
+	nop.i 0
+	;;
+	SSM_PSR_I(p8, p14, r25)
+(p6)	mov b7=r18				// I0
+(p8)	br.dptk.many b7				// B
+
+	mov r27=ar.rsc				// M2 (12 cyc)
+/*
+ * brl.cond doesn't work as intended because the linker would convert this branch
+ * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
+ * future version of the linker.  In the meantime, we just use an indirect branch
+ * instead.
+ */
+#ifdef CONFIG_ITANIUM
+(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
+	;;
+(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
+	;;
+(p6)	mov b7=r14
+(p6)	br.sptk.many b7
+#else
+	BRL_COND_FSYS_BUBBLE_DOWN(p6)
+#endif
+	SSM_PSR_I(p0, p14, r10)
+	mov r10=-1
+(p10)	mov r8=EINVAL
+(p9)	mov r8=ENOSYS
+	FSYS_RETURN
+
+#ifdef CONFIG_PARAVIRT
+	/*
+	 * pad to make the size of this symbol constant
+	 * independent of paravirtualization.
+	 */
+	.align PAGE_SIZE / 8
+#endif
+END(__kernel_syscall_via_epc)
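__kernel_syscall_via_epc moves to the end of the gate page and, under CONFIG_PARAVIRT, is padded with .align so the symbol occupies the same size in every flavor; that keeps the gate-page layout, and hence user-visible addresses, identical across native and paravirtualized builds. The round-up arithmetic behind such alignment, as a small sketch (the 16K page size is an assumption):

	/* Round-up arithmetic behind ".align"; a sketch, not kernel code. */
	#include <stdio.h>

	static unsigned long align_up(unsigned long x, unsigned long a)
	{
		return (x + a - 1) & ~(a - 1);	/* a must be a power of two */
	}

	int main(void)
	{
		unsigned long page_size = 16384;	/* assumed 16K pages */
		/* gate.S aligns to PAGE_SIZE / 8, so every flavor ends on the
		 * same boundary regardless of how much code precedes it */
		printf("%lu\n", align_up(5000, page_size / 8));	/* -> 6144 */
		return 0;
	}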
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
index 3cb1abc00e24..88c64ed47c36 100644
--- a/arch/ia64/kernel/gate.lds.S
+++ b/arch/ia64/kernel/gate.lds.S
@@ -7,6 +7,7 @@
 
 
 #include <asm/system.h>
+#include "paravirt_patchlist.h"
 
 SECTIONS
 {
@@ -33,21 +34,21 @@ SECTIONS
   . = GATE_ADDR + 0x600;
 
   .data.patch		: {
-	__start_gate_mckinley_e9_patchlist = .;
+	__paravirt_start_gate_mckinley_e9_patchlist = .;
	*(.data.patch.mckinley_e9)
-	__end_gate_mckinley_e9_patchlist = .;
+	__paravirt_end_gate_mckinley_e9_patchlist = .;
 
-	__start_gate_vtop_patchlist = .;
+	__paravirt_start_gate_vtop_patchlist = .;
	*(.data.patch.vtop)
-	__end_gate_vtop_patchlist = .;
+	__paravirt_end_gate_vtop_patchlist = .;
 
-	__start_gate_fsyscall_patchlist = .;
+	__paravirt_start_gate_fsyscall_patchlist = .;
	*(.data.patch.fsyscall_table)
-	__end_gate_fsyscall_patchlist = .;
+	__paravirt_end_gate_fsyscall_patchlist = .;
 
-	__start_gate_brl_fsys_bubble_down_patchlist = .;
+	__paravirt_start_gate_brl_fsys_bubble_down_patchlist = .;
	*(.data.patch.brl_fsys_bubble_down)
-	__end_gate_brl_fsys_bubble_down_patchlist = .;
+	__paravirt_end_gate_brl_fsys_bubble_down_patchlist = .;
   } :readable
 
   .IA_64.unwind_info		: { *(.IA_64.unwind_info*) }
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 59301c472800..23f846de62d5 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1050,7 +1050,7 @@ END(ia64_delay_loop)
  * except that the multiplication and the shift are done with 128-bit
  * intermediate precision so that we can produce a full 64-bit result.
  */
-GLOBAL_ENTRY(sched_clock)
+GLOBAL_ENTRY(ia64_native_sched_clock)
	addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
	;;
@@ -1066,7 +1066,13 @@ GLOBAL_ENTRY(sched_clock)
 	;;
	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
	br.ret.sptk.many rp
-END(sched_clock)
+END(ia64_native_sched_clock)
+#ifndef CONFIG_PARAVIRT
+	//unsigned long long
+	//sched_clock(void) __attribute__((alias("ia64_native_sched_clock")));
+	.global sched_clock
+sched_clock = ia64_native_sched_clock
+#endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 GLOBAL_ENTRY(cycle_to_cputime)
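With CONFIG_PARAVIRT disabled, sched_clock is simply an assembler-level alias for ia64_native_sched_clock; when paravirtualized, pv_time_ops.sched_clock supplies it instead (see the paravirt.c hunk below). The commented-out C in this hunk hints at the equivalent alias attribute, shown here as a runnable sketch (GCC-style extension; the return value is a stand-in):

	#include <stdio.h>

	unsigned long long ia64_native_sched_clock(void)
	{
		return 123456789ULL;	/* stand-in for cycles * nsec_per_cyc */
	}

	/* same effect as the "sched_clock = ia64_native_sched_clock" assignment */
	unsigned long long sched_clock(void)
		__attribute__((alias("ia64_native_sched_clock")));

	int main(void)
	{
		printf("%llu\n", sched_clock());
		return 0;
	}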
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index f675d8e33853..ec9a5fdfa1b9 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -804,7 +804,7 @@ ENTRY(break_fault)
 ///////////////////////////////////////////////////////////////////////
	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	mov.m r30=ar.itc			// M    get cycle for accounting
+	MOV_FROM_ITC(p0, p14, r30, r18)		// M    get cycle for accounting
 #else
	mov b6=r30				// I0   setup syscall handler branch reg early
 #endif
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index bab1de2d2f6a..8f33a8840422 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
 
	ia64_mca_cmc_int_handler(cmc_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
-	if (cpuid < NR_CPUS) {
+	if (cpuid < nr_cpu_ids) {
		platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
	} else {
		/* If no log record, switch out of polling mode */
@@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
 
	ia64_mca_cpe_int_handler(cpe_irq, arg);
 
-	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
 
	if (cpuid < NR_CPUS) {
		platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index aaa7d901521f..da3b0cf495a3 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -446,6 +446,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 		mod->arch.opd = s;
	else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
		mod->arch.unwind = s;
+#ifdef CONFIG_PARAVIRT
+	else if (strcmp(".paravirt_bundles",
+			secstrings + s->sh_name) == 0)
+		mod->arch.paravirt_bundles = s;
+	else if (strcmp(".paravirt_insts",
+			secstrings + s->sh_name) == 0)
+		mod->arch.paravirt_insts = s;
+#endif
 
	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
		printk(KERN_ERR "%s: sections missing\n", mod->name);
@@ -525,8 +533,7 @@ get_ltoff (struct module *mod, uint64_t value, int *okp)
 		goto found;
 
	/* Not enough GOT entries? */
-	if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size))
-		BUG();
+	BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
 
	e->val = value;
	++mod->arch.next_got_entry;
@@ -921,6 +928,30 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo
 	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
	if (mod->arch.unwind)
		register_unwind_table(mod);
+#ifdef CONFIG_PARAVIRT
+	if (mod->arch.paravirt_bundles) {
+		struct paravirt_patch_site_bundle *start =
+			(struct paravirt_patch_site_bundle *)
+			mod->arch.paravirt_bundles->sh_addr;
+		struct paravirt_patch_site_bundle *end =
+			(struct paravirt_patch_site_bundle *)
+			(mod->arch.paravirt_bundles->sh_addr +
+			 mod->arch.paravirt_bundles->sh_size);
+
+		paravirt_patch_apply_bundle(start, end);
+	}
+	if (mod->arch.paravirt_insts) {
+		struct paravirt_patch_site_inst *start =
+			(struct paravirt_patch_site_inst *)
+			mod->arch.paravirt_insts->sh_addr;
+		struct paravirt_patch_site_inst *end =
+			(struct paravirt_patch_site_inst *)
+			(mod->arch.paravirt_insts->sh_addr +
+			 mod->arch.paravirt_insts->sh_size);
+
+		paravirt_patch_apply_inst(start, end);
+	}
+#endif
 	return 0;
 }
 
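module_finalize() treats each .paravirt_* ELF section as an array of patch-site records delimited by sh_addr and sh_addr + sh_size, and hands the [start, end) range to the patcher. A userspace model of that walk, with an illustrative record layout (the real structs are declared by the paravirt patching headers, not here):

	#include <stdio.h>

	struct patch_site { void *addr; unsigned long type; };

	static void apply(struct patch_site *start, struct patch_site *end)
	{
		struct patch_site *p;

		for (p = start; p < end; p++)	/* end is one past the last site */
			printf("patch site %p, type %lu\n", p->addr, p->type);
	}

	int main(void)
	{
		static struct patch_site sites[] = {
			{ (void *)0x1000, 1 },
			{ (void *)0x2000, 2 },
		};
		/* sh_addr/sh_size would delimit this array inside a module image */
		apply(sites, sites + sizeof(sites) / sizeof(sites[0]));
		return 0;
	}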
926 957
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 9f14c16f6369..a21d7bb9c69c 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -46,13 +46,23 @@ struct pv_info pv_info = {
  * initialization hooks.
  */
 
-struct pv_init_ops pv_init_ops;
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type);
+
+struct pv_init_ops pv_init_ops =
+{
+#ifdef ASM_SUPPORTED
+	.patch_bundle = ia64_native_patch_bundle,
+#endif
+	.patch_branch = ia64_native_patch_branch,
+};
 
 /***************************************************************************
  * pv_cpu_ops
  * intrinsics hooks.
  */
 
+#ifndef ASM_SUPPORTED
 /* ia64_native_xxx are macros so that we have to make them real functions */
 
 #define DEFINE_VOID_FUNC1(name)			\
@@ -60,7 +70,14 @@ struct pv_init_ops pv_init_ops;
 	ia64_native_ ## name ## _func(unsigned long arg)	\
	{							\
		ia64_native_ ## name(arg);			\
-	}							\
+	}
+
+#define DEFINE_VOID_FUNC1_VOID(name)			\
+	static void					\
+	ia64_native_ ## name ## _func(void *arg)	\
+	{						\
+		ia64_native_ ## name(arg);		\
+	}
 
 #define DEFINE_VOID_FUNC2(name)			\
	static void					\
@@ -68,7 +85,7 @@ struct pv_init_ops pv_init_ops;
 		       unsigned long arg1)		\
	{						\
		ia64_native_ ## name(arg0, arg1);	\
-	}						\
+	}
 
 #define DEFINE_FUNC0(name)			\
	static unsigned long			\
@@ -84,7 +101,7 @@ struct pv_init_ops pv_init_ops;
 		return ia64_native_ ## name(arg);	\
	}						\
 
-DEFINE_VOID_FUNC1(fc);
+DEFINE_VOID_FUNC1_VOID(fc);
 DEFINE_VOID_FUNC1(intrin_local_irq_restore);
 
 DEFINE_VOID_FUNC2(ptcga);
@@ -274,6 +291,266 @@ ia64_native_setreg_func(int regnum, unsigned long val)
 		break;
	}
 }
+#else
+
+#define __DEFINE_FUNC(name, code)					\
+	extern const char ia64_native_ ## name ## _direct_start[];	\
+	extern const char ia64_native_ ## name ## _direct_end[];	\
+	asm (".align 32\n"						\
+	     ".proc ia64_native_" #name "_func\n"			\
+	     "ia64_native_" #name "_func:\n"				\
+	     "ia64_native_" #name "_direct_start:\n"			\
+	     code							\
+	     "ia64_native_" #name "_direct_end:\n"			\
+	     "br.cond.sptk.many b6\n"					\
+	     ".endp ia64_native_" #name "_func\n")
+
+#define DEFINE_VOID_FUNC0(name, code)		\
+	extern void				\
+	ia64_native_ ## name ## _func(void);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1_VOID(name, code)		\
+	extern void					\
+	ia64_native_ ## name ## _func(void *arg);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code)				\
+	extern void						\
+	ia64_native_ ## name ## _func(unsigned long arg0,	\
+				      unsigned long arg1);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC0(name, code)		\
+	extern unsigned long			\
+	ia64_native_ ## name ## _func(void);	\
+	__DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code)			\
+	extern unsigned long				\
+	ia64_native_ ## name ## _func(type arg);	\
+	__DEFINE_FUNC(name, code)
+
+DEFINE_VOID_FUNC1_VOID(fc,
+		       "fc r8\n");
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+		  ";;\n"
+		  "	cmp.ne p6, p7 = r8, r0\n"
+		  ";;\n"
+		  "(p6)	ssm psr.i\n"
+		  "(p7)	rsm psr.i\n"
+		  ";;\n"
+		  "(p6)	srlz.d\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+		  "ptc.ga r8, r9\n");
+DEFINE_VOID_FUNC2(set_rr,
+		  "mov rr[r8] = r9\n");
+
+/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */
+DEFINE_FUNC0(get_psr_i,
+	     "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n"
+	     "mov r8 = psr\n"
+	     ";;\n"
+	     "and r8 = r2, r8\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+	     "thash r8 = r8\n");
+DEFINE_FUNC1(get_cpuid, int,
+	     "mov r8 = cpuid[r8]\n");
+DEFINE_FUNC1(get_pmd, int,
+	     "mov r8 = pmd[r8]\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+	     "mov r8 = rr[r8]\n");
+
+DEFINE_VOID_FUNC0(ssm_i,
+		  "ssm psr.i\n");
+DEFINE_VOID_FUNC0(rsm_i,
+		  "rsm psr.i\n");
+
+extern void
+ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
+				unsigned long val2, unsigned long val3,
+				unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+	      "mov rr[r0] = r8\n"
+	      "movl r2 = 0x2000000000000000\n"
+	      ";;\n"
+	      "mov rr[r2] = r9\n"
+	      "shl r3 = r2, 1\n"	/* movl r3 = 0x4000000000000000 */
+	      ";;\n"
+	      "add r2 = r2, r3\n"	/* movl r2 = 0x6000000000000000 */
+	      "mov rr[r3] = r10\n"
+	      ";;\n"
+	      "mov rr[r2] = r11\n"
+	      "shl r3 = r3, 1\n"	/* movl r3 = 0x8000000000000000 */
+	      ";;\n"
+	      "mov rr[r3] = r14\n");
+
+extern unsigned long ia64_native_getreg_func(int regnum);
+asm(".global ia64_native_getreg_func\n");
+#define __DEFINE_GET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r8\n"			\
+	";;\n"						\
+	"(p6) mov r8 = " #reg "\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_GET_AR(id, reg)	__DEFINE_GET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_GET_CR(id, reg)	__DEFINE_GET_REG(CR_ ## id, cr.reg)
+
+__DEFINE_FUNC(getreg,
+	      __DEFINE_GET_REG(GP, gp)
+	      /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */
+	      __DEFINE_GET_REG(PSR, psr)
+	      __DEFINE_GET_REG(TP, tp)
+	      __DEFINE_GET_REG(SP, sp)
+
+	      __DEFINE_GET_REG(AR_KR0, ar0)
+	      __DEFINE_GET_REG(AR_KR1, ar1)
+	      __DEFINE_GET_REG(AR_KR2, ar2)
+	      __DEFINE_GET_REG(AR_KR3, ar3)
+	      __DEFINE_GET_REG(AR_KR4, ar4)
+	      __DEFINE_GET_REG(AR_KR5, ar5)
+	      __DEFINE_GET_REG(AR_KR6, ar6)
+	      __DEFINE_GET_REG(AR_KR7, ar7)
+	      __DEFINE_GET_AR(RSC, rsc)
+	      __DEFINE_GET_AR(BSP, bsp)
+	      __DEFINE_GET_AR(BSPSTORE, bspstore)
+	      __DEFINE_GET_AR(RNAT, rnat)
+	      __DEFINE_GET_AR(FCR, fcr)
+	      __DEFINE_GET_AR(EFLAG, eflag)
+	      __DEFINE_GET_AR(CSD, csd)
+	      __DEFINE_GET_AR(SSD, ssd)
+	      __DEFINE_GET_REG(AR_CFLAG, ar27)
+	      __DEFINE_GET_AR(FSR, fsr)
+	      __DEFINE_GET_AR(FIR, fir)
+	      __DEFINE_GET_AR(FDR, fdr)
+	      __DEFINE_GET_AR(CCV, ccv)
+	      __DEFINE_GET_AR(UNAT, unat)
+	      __DEFINE_GET_AR(FPSR, fpsr)
+	      __DEFINE_GET_AR(ITC, itc)
+	      __DEFINE_GET_AR(PFS, pfs)
+	      __DEFINE_GET_AR(LC, lc)
+	      __DEFINE_GET_AR(EC, ec)
+
+	      __DEFINE_GET_CR(DCR, dcr)
+	      __DEFINE_GET_CR(ITM, itm)
+	      __DEFINE_GET_CR(IVA, iva)
+	      __DEFINE_GET_CR(PTA, pta)
+	      __DEFINE_GET_CR(IPSR, ipsr)
+	      __DEFINE_GET_CR(ISR, isr)
+	      __DEFINE_GET_CR(IIP, iip)
+	      __DEFINE_GET_CR(IFA, ifa)
+	      __DEFINE_GET_CR(ITIR, itir)
+	      __DEFINE_GET_CR(IIPA, iipa)
+	      __DEFINE_GET_CR(IFS, ifs)
+	      __DEFINE_GET_CR(IIM, iim)
+	      __DEFINE_GET_CR(IHA, iha)
+	      __DEFINE_GET_CR(LID, lid)
+	      __DEFINE_GET_CR(IVR, ivr)
+	      __DEFINE_GET_CR(TPR, tpr)
+	      __DEFINE_GET_CR(EOI, eoi)
+	      __DEFINE_GET_CR(IRR0, irr0)
+	      __DEFINE_GET_CR(IRR1, irr1)
+	      __DEFINE_GET_CR(IRR2, irr2)
+	      __DEFINE_GET_CR(IRR3, irr3)
+	      __DEFINE_GET_CR(ITV, itv)
+	      __DEFINE_GET_CR(PMV, pmv)
+	      __DEFINE_GET_CR(CMCV, cmcv)
+	      __DEFINE_GET_CR(LRR0, lrr0)
+	      __DEFINE_GET_CR(LRR1, lrr1)
+
+	      "mov r8 = -1\n"	/* unsupported case */
+	);
+
+extern void ia64_native_setreg_func(int regnum, unsigned long val);
+asm(".global ia64_native_setreg_func\n");
+#define __DEFINE_SET_REG(id, reg)			\
+	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
+	";;\n"						\
+	"cmp.eq p6, p0 = r2, r9\n"			\
+	";;\n"						\
+	"(p6) mov " #reg " = r8\n"			\
+	"(p6) br.cond.sptk.many b6\n"			\
+	";;\n"
+#define __DEFINE_SET_AR(id, reg)	__DEFINE_SET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_SET_CR(id, reg)	__DEFINE_SET_REG(CR_ ## id, cr.reg)
+__DEFINE_FUNC(setreg,
+	      "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n"
+	      ";;\n"
+	      "cmp.eq p6, p0 = r2, r9\n"
+	      ";;\n"
+	      "(p6) mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      "(p6) br.cond.sptk.many b6\n"
+	      __DEFINE_SET_REG(GP, gp)
+	      __DEFINE_SET_REG(SP, sp)
+
+	      __DEFINE_SET_REG(AR_KR0, ar0)
+	      __DEFINE_SET_REG(AR_KR1, ar1)
+	      __DEFINE_SET_REG(AR_KR2, ar2)
+	      __DEFINE_SET_REG(AR_KR3, ar3)
+	      __DEFINE_SET_REG(AR_KR4, ar4)
+	      __DEFINE_SET_REG(AR_KR5, ar5)
+	      __DEFINE_SET_REG(AR_KR6, ar6)
+	      __DEFINE_SET_REG(AR_KR7, ar7)
+	      __DEFINE_SET_AR(RSC, rsc)
+	      __DEFINE_SET_AR(BSP, bsp)
+	      __DEFINE_SET_AR(BSPSTORE, bspstore)
+	      __DEFINE_SET_AR(RNAT, rnat)
+	      __DEFINE_SET_AR(FCR, fcr)
+	      __DEFINE_SET_AR(EFLAG, eflag)
+	      __DEFINE_SET_AR(CSD, csd)
+	      __DEFINE_SET_AR(SSD, ssd)
+	      __DEFINE_SET_REG(AR_CFLAG, ar27)
+	      __DEFINE_SET_AR(FSR, fsr)
+	      __DEFINE_SET_AR(FIR, fir)
+	      __DEFINE_SET_AR(FDR, fdr)
+	      __DEFINE_SET_AR(CCV, ccv)
+	      __DEFINE_SET_AR(UNAT, unat)
+	      __DEFINE_SET_AR(FPSR, fpsr)
+	      __DEFINE_SET_AR(ITC, itc)
+	      __DEFINE_SET_AR(PFS, pfs)
+	      __DEFINE_SET_AR(LC, lc)
+	      __DEFINE_SET_AR(EC, ec)
+
+	      __DEFINE_SET_CR(DCR, dcr)
+	      __DEFINE_SET_CR(ITM, itm)
+	      __DEFINE_SET_CR(IVA, iva)
+	      __DEFINE_SET_CR(PTA, pta)
+	      __DEFINE_SET_CR(IPSR, ipsr)
+	      __DEFINE_SET_CR(ISR, isr)
+	      __DEFINE_SET_CR(IIP, iip)
+	      __DEFINE_SET_CR(IFA, ifa)
+	      __DEFINE_SET_CR(ITIR, itir)
+	      __DEFINE_SET_CR(IIPA, iipa)
+	      __DEFINE_SET_CR(IFS, ifs)
+	      __DEFINE_SET_CR(IIM, iim)
+	      __DEFINE_SET_CR(IHA, iha)
+	      __DEFINE_SET_CR(LID, lid)
+	      __DEFINE_SET_CR(IVR, ivr)
+	      __DEFINE_SET_CR(TPR, tpr)
+	      __DEFINE_SET_CR(EOI, eoi)
+	      __DEFINE_SET_CR(IRR0, irr0)
+	      __DEFINE_SET_CR(IRR1, irr1)
+	      __DEFINE_SET_CR(IRR2, irr2)
+	      __DEFINE_SET_CR(IRR3, irr3)
+	      __DEFINE_SET_CR(ITV, itv)
+	      __DEFINE_SET_CR(PMV, pmv)
+	      __DEFINE_SET_CR(CMCV, cmcv)
+	      __DEFINE_SET_CR(LRR0, lrr0)
+	      __DEFINE_SET_CR(LRR1, lrr1)
+	);
+#endif
 
 struct pv_cpu_ops pv_cpu_ops = {
	.fc = ia64_native_fc_func,
@@ -366,4 +643,258 @@ ia64_native_do_steal_accounting(unsigned long *new_itm)
 
 struct pv_time_ops pv_time_ops = {
	.do_steal_accounting = ia64_native_do_steal_accounting,
+	.sched_clock = ia64_native_sched_clock,
+};
+
+/***************************************************************************
+ * binary patching
+ * pv_init_ops.patch_bundle
+ */
+
+#ifdef ASM_SUPPORTED
+#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg)	\
+	__DEFINE_FUNC(get_ ## name,			\
+		      ";;\n"				\
+		      "mov r8 = " #reg "\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
+	__DEFINE_FUNC(set_ ## name,			\
+		      ";;\n"				\
+		      "mov " #reg " = r8\n"		\
+		      ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg)		\
+	IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg);	\
+	IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)
+
+#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg)		\
+	IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg)
+
+#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg)		\
+	IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg)
+
+
+IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr);
+IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp);
+
+/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */
+__DEFINE_FUNC(set_psr_l,
+	      ";;\n"
+	      "mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+	      ".serialize.data\n"
+#endif
+	      ";;\n");
+
+IA64_NATIVE_PATCH_DEFINE_REG(gp, gp);
+IA64_NATIVE_PATCH_DEFINE_REG(sp, sp);
+
+IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0);
+IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1);
+IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2);
+IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3);
+IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4);
+IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5);
+IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6);
+IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7);
+
+IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc);
+IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp);
+IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore);
+IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat);
+IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr);
+IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag);
+IA64_NATIVE_PATCH_DEFINE_AR(csd, csd);
+IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd);
+IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27);
+IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr);
+IA64_NATIVE_PATCH_DEFINE_AR(fir, fir);
+IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr);
+IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv);
+IA64_NATIVE_PATCH_DEFINE_AR(unat, unat);
+IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr);
+IA64_NATIVE_PATCH_DEFINE_AR(itc, itc);
+IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs);
+IA64_NATIVE_PATCH_DEFINE_AR(lc, lc);
+IA64_NATIVE_PATCH_DEFINE_AR(ec, ec);
+
+IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr);
+IA64_NATIVE_PATCH_DEFINE_CR(itm, itm);
+IA64_NATIVE_PATCH_DEFINE_CR(iva, iva);
+IA64_NATIVE_PATCH_DEFINE_CR(pta, pta);
+IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr);
+IA64_NATIVE_PATCH_DEFINE_CR(isr, isr);
+IA64_NATIVE_PATCH_DEFINE_CR(iip, iip);
+IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa);
+IA64_NATIVE_PATCH_DEFINE_CR(itir, itir);
+IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa);
+IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs);
+IA64_NATIVE_PATCH_DEFINE_CR(iim, iim);
+IA64_NATIVE_PATCH_DEFINE_CR(iha, iha);
+IA64_NATIVE_PATCH_DEFINE_CR(lid, lid);
+IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr);
+IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr);
+IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi);
+IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0);
+IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1);
+IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2);
+IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3);
+IA64_NATIVE_PATCH_DEFINE_CR(itv, itv);
+IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv);
+IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1);
+
+static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type)		\
+	{							\
+		(void*)ia64_native_ ## name ## _direct_start,	\
+		(void*)ia64_native_ ## name ## _direct_end,	\
+		PARAVIRT_PATCH_TYPE_ ## type,			\
+	}
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore,
+				      INTRIN_LOCAL_IRQ_RESTORE),
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_get_ ## name ## _direct_start,	\
+		(void*)ia64_native_get_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)			\
+	{								\
+		(void*)ia64_native_set_ ## name ## _direct_start,	\
+		(void*)ia64_native_set_ ## name ## _direct_end,		\
+		PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg,		\
+	}
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg),	\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg)
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg)		\
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg)
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC),
+
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0),
+	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1),
 };
+
+unsigned long __init_or_module
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+	const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) /
+		sizeof(ia64_native_patch_bundle_elems[0]);
+
+	return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+					     ia64_native_patch_bundle_elems,
+					     nelems, NULL);
+}
+#endif /* ASM_SUPPORTED */
+
+extern const char ia64_native_switch_to[];
+extern const char ia64_native_leave_syscall[];
+extern const char ia64_native_work_processed_syscall[];
+extern const char ia64_native_leave_kernel[];
+
+const struct paravirt_patch_branch_target ia64_native_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type)			\
+	{						\
+		ia64_native_ ## name,			\
+		PARAVIRT_PATCH_TYPE_BR_ ## type,	\
+	}
+	PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+	PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+	PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+	PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type)
+{
+	const unsigned long nelem =
+		sizeof(ia64_native_branch_target) /
+		sizeof(ia64_native_branch_target[0]);
+	__paravirt_patch_apply_branch(tag, type,
+				      ia64_native_branch_target, nelem);
+}
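ia64_native_patch_bundle() resolves a patch type through a table of [start, end) native code ranges and lets __paravirt_patch_apply_bundle() copy the matching range over the call site if it fits. A self-contained model of that lookup-and-copy step (types, sizes, and data here are invented for illustration):

	#include <stdio.h>
	#include <string.h>

	struct bundle_elem { const void *s, *e; unsigned long type; };

	static unsigned long
	apply_bundle(void *dst, unsigned long room, unsigned long type,
		     const struct bundle_elem *elems, unsigned long nelems)
	{
		unsigned long i;

		for (i = 0; i < nelems; i++) {
			const struct bundle_elem *p = &elems[i];
			unsigned long need = (const char *)p->e - (const char *)p->s;

			if (p->type != type)
				continue;
			if (room < need)	/* too big to fit: skip */
				return 0;
			memcpy(dst, p->s, need);
			return need;		/* bytes actually patched in */
		}
		return 0;
	}

	int main(void)
	{
		static const char code[] = { 0x11, 0x22, 0x33, 0x44 };
		static const struct bundle_elem elems[] = {
			{ code, code + sizeof(code), 7 },
		};
		char site[16] = { 0 };

		printf("used %lu bytes\n",
		       apply_bundle(site, sizeof(site), 7, elems, 1));
		return 0;
	}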
diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c
new file mode 100644
index 000000000000..bfdfef1b1ffd
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patch.c
@@ -0,0 +1,514 @@
+/******************************************************************************
+ * linux/arch/ia64/kernel/paravirt_patch.c
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/intrinsics.h>
+#include <asm/kprobes.h>
+#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
+
+typedef union ia64_inst {
+	struct {
+		unsigned long long qp : 6;
+		unsigned long long : 31;
+		unsigned long long opcode : 4;
+		unsigned long long reserved : 23;
+	} generic;
+	unsigned long long l;
+} ia64_inst_t;
+
+/*
+ * flush_icache_range() can't be used here.
+ * we are here before cpu_init() which initializes
+ * ia64_i_cache_stride_shift. flush_icache_range() uses it.
+ */
+void __init_or_module
+paravirt_flush_i_cache_range(const void *instr, unsigned long size)
+{
+	extern void paravirt_fc_i(const void *addr);
+	unsigned long i;
+
+	for (i = 0; i < size; i += sizeof(bundle_t))
+		paravirt_fc_i(instr + i);
+}
+
+bundle_t* __init_or_module
+paravirt_get_bundle(unsigned long tag)
+{
+	return (bundle_t *)(tag & ~3UL);
+}
+
+unsigned long __init_or_module
+paravirt_get_slot(unsigned long tag)
+{
+	return tag & 3UL;
+}
+
+unsigned long __init_or_module
+paravirt_get_num_inst(unsigned long stag, unsigned long etag)
+{
+	bundle_t *sbundle = paravirt_get_bundle(stag);
+	unsigned long sslot = paravirt_get_slot(stag);
+	bundle_t *ebundle = paravirt_get_bundle(etag);
+	unsigned long eslot = paravirt_get_slot(etag);
+
+	return (ebundle - sbundle) * 3 + eslot - sslot + 1;
+}
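A patch tag packs a bundle address and a slot index: IA-64 bundles are 16 bytes holding three instruction slots, so the low two bits of the aligned address are free to carry slot 0-2, and masking them off recovers the bundle pointer; the instruction count between two tags is then the bundle difference times three plus the slot delta. A standalone check of that arithmetic (bundle_t mocked as a 16-byte struct; the integer-to-pointer casts are for the model only):

	#include <stdio.h>

	typedef struct { unsigned long long q[2]; } bundle_t;	/* 16 bytes */

	static unsigned long get_slot(unsigned long tag) { return tag & 3UL; }
	static bundle_t *get_bundle(unsigned long tag)
	{
		return (bundle_t *)(tag & ~3UL);
	}

	static unsigned long num_inst(unsigned long stag, unsigned long etag)
	{
		/* three instruction slots per 16-byte bundle */
		return (get_bundle(etag) - get_bundle(stag)) * 3
			+ get_slot(etag) - get_slot(stag) + 1;
	}

	int main(void)
	{
		unsigned long start = 0x1000 | 1;	/* bundle 0x1000, slot 1 */
		unsigned long end   = 0x1020 | 0;	/* bundle 0x1020, slot 0 */

		/* slots: (0x1000,1) (0x1000,2) (0x1010,0..2) (0x1020,0) -> 6 */
		printf("%lu instructions\n", num_inst(start, end));
		return 0;
	}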
+
+unsigned long __init_or_module
+paravirt_get_next_tag(unsigned long tag)
+{
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+	case 1:
+		return tag + 1;
+	case 2: {
+		bundle_t *bundle = paravirt_get_bundle(tag);
+		return (unsigned long)(bundle + 1);
+	}
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot0(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot0;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot1(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad0.slot1_p0 |
+		((unsigned long long)bundle->quad1.slot1_p1 << 18UL);
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot2(const bundle_t *bundle)
+{
+	ia64_inst_t inst;
+	inst.l = bundle->quad1.slot2;
+	return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_inst(unsigned long tag)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		return paravirt_read_slot0(bundle);
+	case 1:
+		return paravirt_read_slot1(bundle);
+	case 2:
+		return paravirt_read_slot2(bundle);
+	default:
+		BUG();
+	}
+	/* NOTREACHED */
+}
+
+void __init_or_module
+paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot0 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad0.slot1_p0 = inst.l;
+	bundle->quad1.slot1_p1 = inst.l >> 18UL;
+}
+
+void __init_or_module
+paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst)
+{
+	bundle->quad1.slot2 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_inst(unsigned long tag, ia64_inst_t inst)
+{
+	bundle_t *bundle = paravirt_get_bundle(tag);
+	unsigned long slot = paravirt_get_slot(tag);
+
+	switch (slot) {
+	case 0:
+		paravirt_write_slot0(bundle, inst);
+		break;
+	case 1:
+		paravirt_write_slot1(bundle, inst);
+		break;
+	case 2:
+		paravirt_write_slot2(bundle, inst);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	paravirt_flush_i_cache_range(bundle, sizeof(*bundle));
+}
181
182/* for debug */
183void
184paravirt_print_bundle(const bundle_t *bundle)
185{
186 const unsigned long *quad = (const unsigned long *)bundle;
187 ia64_inst_t slot0 = paravirt_read_slot0(bundle);
188 ia64_inst_t slot1 = paravirt_read_slot1(bundle);
189 ia64_inst_t slot2 = paravirt_read_slot2(bundle);
190
191 printk(KERN_DEBUG
192 "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]);
193 printk(KERN_DEBUG
194 "bundle template 0x%x\n",
195 bundle->quad0.template);
196 printk(KERN_DEBUG
197 "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n",
198 (unsigned long)bundle->quad0.slot0,
199 (unsigned long)bundle->quad0.slot1_p0,
200 (unsigned long)bundle->quad1.slot1_p1,
201 (unsigned long)bundle->quad1.slot2);
202 printk(KERN_DEBUG
203 "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n",
204 slot0.l, slot1.l, slot2.l);
205}
206
207static int noreplace_paravirt __init_or_module = 0;
208
209static int __init setup_noreplace_paravirt(char *str)
210{
211 noreplace_paravirt = 1;
212 return 1;
213}
214__setup("noreplace-paravirt", setup_noreplace_paravirt);
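(Booting with noreplace-paravirt on the kernel command line therefore leaves every patch site untouched, which can help when bisecting a suspected patching bug.)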
215
216#ifdef ASM_SUPPORTED
217static void __init_or_module
218fill_nop_bundle(void *sbundle, void *ebundle)
219{
220 extern const char paravirt_nop_bundle[];
221 extern const unsigned long paravirt_nop_bundle_size;
222
223 void *bundle = sbundle;
224
225 BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
226 BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
227
228 while (bundle < ebundle) {
229 memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size);
230
231 bundle += paravirt_nop_bundle_size;
232 }
233}
234
235/* helper function */
236unsigned long __init_or_module
237__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
238 const struct paravirt_patch_bundle_elem *elems,
239 unsigned long nelems,
240 const struct paravirt_patch_bundle_elem **found)
241{
242 unsigned long used = 0;
243 unsigned long i;
244
245 BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
246 BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
247
248 if (found != NULL) *found = NULL; /* no match found yet */
249 for (i = 0; i < nelems; i++) {
250 const struct paravirt_patch_bundle_elem *p = &elems[i];
251 if (p->type == type) {
252 unsigned long need = p->ebundle - p->sbundle;
253 unsigned long room = ebundle - sbundle;
254
255 if (found != NULL)
256 *found = p;
257
258 if (room < need) {
259 /* not enough room to replace; skip it */
260 printk(KERN_DEBUG
261        "not enough room to put replacement "
262        "bundles. type %ld need %ld room %ld\n",
263        type, need, room);
264 break;
265 }
266
267 used = need;
268 memcpy(sbundle, p->sbundle, used);
269 break;
270 }
271 }
272
273 return used;
274}
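A pv backend is expected to wire this helper into its pv_init_ops.patch_bundle hook roughly as follows; my_elems is a hypothetical replacement table, not something defined in this patch:

	static unsigned long __init_or_module
	my_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
	{
		const struct paravirt_patch_bundle_elem *found;

		/* my_elems[] maps patch types to replacement bundle ranges */
		return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
						     my_elems,
						     ARRAY_SIZE(my_elems),
						     &found);
	}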
275
276void __init_or_module
277paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
278 const struct paravirt_patch_site_bundle *end)
279{
280 const struct paravirt_patch_site_bundle *p;
281
282 if (noreplace_paravirt)
283 return;
284 if (pv_init_ops.patch_bundle == NULL)
285 return;
286
287 for (p = start; p < end; p++) {
288 unsigned long used;
289
290 used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle,
291 p->type);
292 if (used == 0)
293 continue;
294
295 fill_nop_bundle(p->sbundle + used, p->ebundle);
296 paravirt_flush_i_cache_range(p->sbundle,
297 p->ebundle - p->sbundle);
298 }
299 ia64_sync_i();
300 ia64_srlz_i();
301}
302
303/*
304 * nop.i, nop.m and nop.f instructions share the same format,
305 * but nop.b has a different format.
306 * nop.b is not supported for now.
307 */
308static void __init_or_module
309fill_nop_inst(unsigned long stag, unsigned long etag)
310{
311 extern const bundle_t paravirt_nop_mfi_inst_bundle[];
312 unsigned long tag;
313 const ia64_inst_t nop_inst =
314 paravirt_read_slot0(paravirt_nop_mfi_inst_bundle);
315
316 for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag))
317 paravirt_write_inst(tag, nop_inst);
318}
319
320void __init_or_module
321paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
322 const struct paravirt_patch_site_inst *end)
323{
324 const struct paravirt_patch_site_inst *p;
325
326 if (noreplace_paravirt)
327 return;
328 if (pv_init_ops.patch_inst == NULL)
329 return;
330
331 for (p = start; p < end; p++) {
332 unsigned long tag;
333 bundle_t *sbundle;
334 bundle_t *ebundle;
335
336 tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type);
337 if (tag == p->stag)
338 continue;
339
340 fill_nop_inst(tag, p->etag);
341 sbundle = paravirt_get_bundle(p->stag);
342 ebundle = paravirt_get_bundle(p->etag) + 1;
343 paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) *
344 sizeof(bundle_t));
345 }
346 ia64_sync_i();
347 ia64_srlz_i();
348}
349#endif /* ASM_SUPPORTED */
350
351/* brl.cond.sptk.many <target64> X3 */
352typedef union inst_x3_op {
353 ia64_inst_t inst;
354 struct {
355 unsigned long qp: 6;
356 unsigned long btype: 3;
357 unsigned long unused: 3;
358 unsigned long p: 1;
359 unsigned long imm20b: 20;
360 unsigned long wh: 2;
361 unsigned long d: 1;
362 unsigned long i: 1;
363 unsigned long opcode: 4;
364 };
365 unsigned long l;
366} inst_x3_op_t;
367
368typedef union inst_x3_imm {
369 ia64_inst_t inst;
370 struct {
371 unsigned long unused: 2;
372 unsigned long imm39: 39;
373 };
374 unsigned long l;
375} inst_x3_imm_t;
376
377void __init_or_module
378paravirt_patch_reloc_brl(unsigned long tag, const void *target)
379{
380 unsigned long tag_op = paravirt_get_next_tag(tag);
381 unsigned long tag_imm = tag;
382 bundle_t *bundle = paravirt_get_bundle(tag);
383
384 ia64_inst_t inst_op = paravirt_read_inst(tag_op);
385 ia64_inst_t inst_imm = paravirt_read_inst(tag_imm);
386
387 inst_x3_op_t inst_x3_op = { .l = inst_op.l };
388 inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l };
389
390 unsigned long imm60 =
391 ((unsigned long)target - (unsigned long)bundle) >> 4;
392
393 BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */
394 BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
395
396 /* imm60[59] 1bit */
397 inst_x3_op.i = (imm60 >> 59) & 1;
398 /* imm60[19:0] 20bit */
399 inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1);
400 /* imm60[58:20] 39bit */
401 inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1);
402
403 inst_op.l = inst_x3_op.l;
404 inst_imm.l = inst_x3_imm.l;
405
406 paravirt_write_inst(tag_op, inst_op);
407 paravirt_write_inst(tag_imm, inst_imm);
408}
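Worked example of the X3 immediate scatter: a brl whose target lies 0x12340 bytes past the bundle gives imm60 = 0x12340 >> 4 = 0x1234, so i = imm60[59] = 0, imm20b = imm60[19:0] = 0x1234 and imm39 = imm60[58:20] = 0, which are exactly the three fields written back into the two slots above.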
409
410/* br.cond.sptk.many <target25> B1 */
411typedef union inst_b1 {
412 ia64_inst_t inst;
413 struct {
414 unsigned long qp: 6;
415 unsigned long btype: 3;
416 unsigned long unused: 3;
417 unsigned long p: 1;
418 unsigned long imm20b: 20;
419 unsigned long wh: 2;
420 unsigned long d: 1;
421 unsigned long s: 1;
422 unsigned long opcode: 4;
423 };
424 unsigned long l;
425} inst_b1_t;
426
427void __init
428paravirt_patch_reloc_br(unsigned long tag, const void *target)
429{
430 bundle_t *bundle = paravirt_get_bundle(tag);
431 ia64_inst_t inst = paravirt_read_inst(tag);
432 unsigned long target25 = (unsigned long)target - (unsigned long)bundle;
433 inst_b1_t inst_b1;
434
435 BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
436
437 inst_b1.l = inst.l;
438 if (target25 & (1UL << 63))
439 inst_b1.s = 1;
440 else
441 inst_b1.s = 0;
442
443 inst_b1.imm20b = target25 >> 4;
444 inst.l = inst_b1.l;
445
446 paravirt_write_inst(tag, inst);
447}
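In the B1 form the byte displacement must fit in 25 bits: s is the sign bit (bit 63 of the two's-complement distance), imm20b carries distance bits [23:4], and the low four bits are implicitly zero because both the site and the target are bundle aligned.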
448
449void __init
450__paravirt_patch_apply_branch(
451 unsigned long tag, unsigned long type,
452 const struct paravirt_patch_branch_target *entries,
453 unsigned int nr_entries)
454{
455 unsigned int i;
456 for (i = 0; i < nr_entries; i++) {
457 if (entries[i].type == type) {
458 paravirt_patch_reloc_br(tag, entries[i].entry);
459 break;
460 }
461 }
462}
463
464static void __init
465paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start,
466 const struct paravirt_patch_site_branch *end)
467{
468 const struct paravirt_patch_site_branch *p;
469
470 if (noreplace_paravirt)
471 return;
472 if (pv_init_ops.patch_branch == NULL)
473 return;
474
475 for (p = start; p < end; p++)
476 (*pv_init_ops.patch_branch)(p->tag, p->type);
477
478 ia64_sync_i();
479 ia64_srlz_i();
480}
481
482void __init
483paravirt_patch_apply(void)
484{
485 extern const char __start_paravirt_bundles[];
486 extern const char __stop_paravirt_bundles[];
487 extern const char __start_paravirt_insts[];
488 extern const char __stop_paravirt_insts[];
489 extern const char __start_paravirt_branches[];
490 extern const char __stop_paravirt_branches[];
491
492 paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *)
493 __start_paravirt_bundles,
494 (const struct paravirt_patch_site_bundle *)
495 __stop_paravirt_bundles);
496 paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *)
497 __start_paravirt_insts,
498 (const struct paravirt_patch_site_inst *)
499 __stop_paravirt_insts);
500 paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *)
501 __start_paravirt_branches,
502 (const struct paravirt_patch_site_branch *)
503 __stop_paravirt_branches);
504}
505
506/*
507 * Local variables:
508 * mode: C
509 * c-set-style: "linux"
510 * c-basic-offset: 8
511 * tab-width: 8
512 * indent-tabs-mode: t
513 * End:
514 */
diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c
new file mode 100644
index 000000000000..b28082a95d45
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.c
@@ -0,0 +1,79 @@
1/******************************************************************************
2 * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
3 * VA Linux Systems Japan K.K.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 */
20
21#include <linux/bug.h>
22#include <asm/paravirt.h>
23
24#define DECLARE(name) \
25 extern unsigned long \
26 __ia64_native_start_gate_##name##_patchlist[]; \
27 extern unsigned long \
28 __ia64_native_end_gate_##name##_patchlist[]
29
30DECLARE(fsyscall);
31DECLARE(brl_fsys_bubble_down);
32DECLARE(vtop);
33DECLARE(mckinley_e9);
34
35extern unsigned long __start_gate_section[];
36
37#define ASSIGN(name) \
38 .start_##name##_patchlist = \
39 (unsigned long)__ia64_native_start_gate_##name##_patchlist, \
40 .end_##name##_patchlist = \
41 (unsigned long)__ia64_native_end_gate_##name##_patchlist
42
43struct pv_patchdata pv_patchdata __initdata = {
44 ASSIGN(fsyscall),
45 ASSIGN(brl_fsys_bubble_down),
46 ASSIGN(vtop),
47 ASSIGN(mckinley_e9),
48
49 .gate_section = (void*)__start_gate_section,
50};
51
52
53unsigned long __init
54paravirt_get_gate_patchlist(enum pv_gate_patchlist type)
55{
56
57#define CASE(NAME, name) \
58 case PV_GATE_START_##NAME: \
59 return pv_patchdata.start_##name##_patchlist; \
60 case PV_GATE_END_##NAME: \
61 return pv_patchdata.end_##name##_patchlist; \
62
63 switch (type) {
64 CASE(FSYSCALL, fsyscall);
65 CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down);
66 CASE(VTOP, vtop);
67 CASE(MCKINLEY_E9, mckinley_e9);
68 default:
69 BUG();
70 break;
71 }
72 return 0;
73}
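For readability, a single instance of the CASE() macro above, CASE(VTOP, vtop), expands to:

	case PV_GATE_START_VTOP:
		return pv_patchdata.start_vtop_patchlist;
	case PV_GATE_END_VTOP:
		return pv_patchdata.end_vtop_patchlist;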
74
75void * __init
76paravirt_get_gate_section(void)
77{
78 return pv_patchdata.gate_section;
79}
diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h
new file mode 100644
index 000000000000..0684aa6c6507
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.h
@@ -0,0 +1,28 @@
1/******************************************************************************
2 * linux/arch/ia64/xen/paravirt_patchlist.h
3 *
4 * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
5 * VA Linux Systems Japan K.K.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 */
22
23#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN)
24#include <asm/xen/patchlist.h>
25#else
26#include <asm/native/patchlist.h>
27#endif
28
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
index 2f42fcb9776a..6158560d7f17 100644
--- a/arch/ia64/kernel/paravirtentry.S
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -20,8 +20,11 @@
20 * 20 *
21 */ 21 */
22 22
23#include <linux/init.h>
23#include <asm/asmmacro.h> 24#include <asm/asmmacro.h>
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
26#include <asm/paravirt_privop.h>
27#include <asm/paravirt_patch.h>
25#include "entry.h" 28#include "entry.h"
26 29
27#define DATA8(sym, init_value) \ 30#define DATA8(sym, init_value) \
@@ -32,29 +35,87 @@
32 data8 init_value ; \ 35 data8 init_value ; \
33 .popsection 36 .popsection
34 37
35#define BRANCH(targ, reg, breg) \ 38#define BRANCH(targ, reg, breg, type) \
36 movl reg=targ ; \ 39 PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \
37 ;; \ 40 ;; \
38 ld8 reg=[reg] ; \ 41 movl reg=targ ; \
39 ;; \ 42 ;; \
40 mov breg=reg ; \ 43 ld8 reg=[reg] ; \
44 ;; \
45 mov breg=reg ; \
41 br.cond.sptk.many breg 46 br.cond.sptk.many breg
42 47
43#define BRANCH_PROC(sym, reg, breg) \ 48#define BRANCH_PROC(sym, reg, breg, type) \
44 DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ 49 DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
45 GLOBAL_ENTRY(paravirt_ ## sym) ; \ 50 GLOBAL_ENTRY(paravirt_ ## sym) ; \
46 BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ 51 BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \
47 END(paravirt_ ## sym) 52 END(paravirt_ ## sym)
48 53
49#define BRANCH_PROC_UNWINFO(sym, reg, breg) \ 54#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \
50 DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ 55 DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
51 GLOBAL_ENTRY(paravirt_ ## sym) ; \ 56 GLOBAL_ENTRY(paravirt_ ## sym) ; \
52 PT_REGS_UNWIND_INFO(0) ; \ 57 PT_REGS_UNWIND_INFO(0) ; \
53 BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ 58 BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \
54 END(paravirt_ ## sym) 59 END(paravirt_ ## sym)
55 60
56 61
57BRANCH_PROC(switch_to, r22, b7) 62BRANCH_PROC(switch_to, r22, b7, SWITCH_TO)
58BRANCH_PROC_UNWINFO(leave_syscall, r22, b7) 63BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL)
59BRANCH_PROC(work_processed_syscall, r2, b7) 64BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL)
60BRANCH_PROC_UNWINFO(leave_kernel, r22, b7) 65BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL)
66
67
68#ifdef CONFIG_MODULES
69#define __INIT_OR_MODULE .text
70#define __INITDATA_OR_MODULE .data
71#else
72#define __INIT_OR_MODULE __INIT
73#define __INITDATA_OR_MODULE __INITDATA
74#endif /* CONFIG_MODULES */
75
76 __INIT_OR_MODULE
77 GLOBAL_ENTRY(paravirt_fc_i)
78 fc.i r32
79 br.ret.sptk.many rp
80 END(paravirt_fc_i)
81 __FINIT
82
83 __INIT_OR_MODULE
84 .align 32
85 GLOBAL_ENTRY(paravirt_nop_b_inst_bundle)
86 {
87 nop.b 0
88 nop.b 0
89 nop.b 0
90 }
91 END(paravirt_nop_b_inst_bundle)
92 __FINIT
93
94 /* NOTE: nop.[mfi] has same format */
95 __INIT_OR_MODULE
96 GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle)
97 {
98 nop.m 0
99 nop.f 0
100 nop.i 0
101 }
102 END(paravirt_nop_mfi_inst_bundle)
103 __FINIT
104
105 __INIT_OR_MODULE
106 GLOBAL_ENTRY(paravirt_nop_bundle)
107paravirt_nop_bundle_start:
108 {
109 nop 0
110 nop 0
111 nop 0
112 }
113paravirt_nop_bundle_end:
114 END(paravirt_nop_bundle)
115 __FINIT
116
117 __INITDATA_OR_MODULE
118 .align 8
119 .global paravirt_nop_bundle_size
120paravirt_nop_bundle_size:
121 data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index b83b2c516008..68a1311db806 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -7,6 +7,7 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/string.h> 8#include <linux/string.h>
9 9
10#include <asm/paravirt.h>
10#include <asm/patch.h> 11#include <asm/patch.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/sections.h> 13#include <asm/sections.h>
@@ -169,16 +170,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
169 ia64_srlz_i(); 170 ia64_srlz_i();
170} 171}
171 172
173extern unsigned long ia64_native_fsyscall_table[NR_syscalls];
174extern char ia64_native_fsys_bubble_down[];
175struct pv_fsys_data pv_fsys_data __initdata = {
176 .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table,
177 .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down,
178};
179
180unsigned long * __init
181paravirt_get_fsyscall_table(void)
182{
183 return pv_fsys_data.fsyscall_table;
184}
185
186char * __init
187paravirt_get_fsys_bubble_down(void)
188{
189 return pv_fsys_data.fsys_bubble_down;
190}
191
172static void __init 192static void __init
173patch_fsyscall_table (unsigned long start, unsigned long end) 193patch_fsyscall_table (unsigned long start, unsigned long end)
174{ 194{
175 extern unsigned long fsyscall_table[NR_syscalls]; 195 u64 fsyscall_table = (u64)paravirt_get_fsyscall_table();
176 s32 *offp = (s32 *) start; 196 s32 *offp = (s32 *) start;
177 u64 ip; 197 u64 ip;
178 198
179 while (offp < (s32 *) end) { 199 while (offp < (s32 *) end) {
180 ip = (u64) ia64_imva((char *) offp + *offp); 200 ip = (u64) ia64_imva((char *) offp + *offp);
181 ia64_patch_imm64(ip, (u64) fsyscall_table); 201 ia64_patch_imm64(ip, fsyscall_table);
182 ia64_fc((void *) ip); 202 ia64_fc((void *) ip);
183 ++offp; 203 ++offp;
184 } 204 }
@@ -189,7 +209,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end)
189static void __init 209static void __init
190patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) 210patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
191{ 211{
192 extern char fsys_bubble_down[]; 212 u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down();
193 s32 *offp = (s32 *) start; 213 s32 *offp = (s32 *) start;
194 u64 ip; 214 u64 ip;
195 215
@@ -207,13 +227,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
207void __init 227void __init
208ia64_patch_gate (void) 228ia64_patch_gate (void)
209{ 229{
210# define START(name) ((unsigned long) __start_gate_##name##_patchlist) 230# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name)
211# define END(name) ((unsigned long)__end_gate_##name##_patchlist) 231# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name)
212 232
213 patch_fsyscall_table(START(fsyscall), END(fsyscall)); 233 patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL));
214 patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); 234 patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN));
215 ia64_patch_vtop(START(vtop), END(vtop)); 235 ia64_patch_vtop(START(VTOP), END(VTOP));
216 ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); 236 ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9));
217} 237}
218 238
219void ia64_patch_phys_stack_reg(unsigned long val) 239void ia64_patch_phys_stack_reg(unsigned long val)
@@ -229,7 +249,7 @@ void ia64_patch_phys_stack_reg(unsigned long val)
229 while (offp < end) { 249 while (offp < end) {
230 ip = (u64) offp + *offp; 250 ip = (u64) offp + *offp;
231 ia64_patch(ip, mask, imm); 251 ia64_patch(ip, mask, imm);
232 ia64_fc(ip); 252 ia64_fc((void *)ip);
233 ++offp; 253 ++offp;
234 } 254 }
235 ia64_sync_i(); 255 ia64_sync_i();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5c0f408cfd71..8a06dc480594 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg)
5603 * /proc/perfmon interface, for debug only 5603 * /proc/perfmon interface, for debug only
5604 */ 5604 */
5605 5605
5606#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) 5606#define PFM_PROC_SHOW_HEADER ((void *)nr_cpu_ids+1)
5607 5607
5608static void * 5608static void *
5609pfm_proc_start(struct seq_file *m, loff_t *pos) 5609pfm_proc_start(struct seq_file *m, loff_t *pos)
@@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos)
5612 return PFM_PROC_SHOW_HEADER; 5612 return PFM_PROC_SHOW_HEADER;
5613 } 5613 }
5614 5614
5615 while (*pos <= NR_CPUS) { 5615 while (*pos <= nr_cpu_ids) {
5616 if (cpu_online(*pos - 1)) { 5616 if (cpu_online(*pos - 1)) {
5617 return (void *)*pos; 5617 return (void *)*pos;
5618 } 5618 }
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index ecb9eb78d687..7053c55b7649 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -317,7 +317,7 @@ retry:
317 } 317 }
318 318
319 n = data->cpu_check; 319 n = data->cpu_check;
320 for (i = 0; i < NR_CPUS; i++) { 320 for (i = 0; i < nr_cpu_ids; i++) {
321 if (cpu_isset(n, data->cpu_event)) { 321 if (cpu_isset(n, data->cpu_event)) {
322 if (!cpu_online(n)) { 322 if (!cpu_online(n)) {
323 cpu_clear(n, data->cpu_event); 323 cpu_clear(n, data->cpu_event);
@@ -326,7 +326,7 @@ retry:
326 cpu = n; 326 cpu = n;
327 break; 327 break;
328 } 328 }
329 if (++n == NR_CPUS) 329 if (++n == nr_cpu_ids)
330 n = 0; 330 n = 0;
331 } 331 }
332 332
@@ -337,7 +337,7 @@ retry:
337 337
338 /* for next read, start checking at next CPU */ 338 /* for next read, start checking at next CPU */
339 data->cpu_check = cpu; 339 data->cpu_check = cpu;
340 if (++data->cpu_check == NR_CPUS) 340 if (++data->cpu_check == nr_cpu_ids)
341 data->cpu_check = 0; 341 data->cpu_check = 0;
342 342
343 snprintf(cmd, sizeof(cmd), "read %d\n", cpu); 343 snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 865af27c7737..714066aeda7f 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -52,6 +52,7 @@
52#include <asm/meminit.h> 52#include <asm/meminit.h>
53#include <asm/page.h> 53#include <asm/page.h>
54#include <asm/paravirt.h> 54#include <asm/paravirt.h>
55#include <asm/paravirt_patch.h>
55#include <asm/patch.h> 56#include <asm/patch.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57#include <asm/processor.h> 58#include <asm/processor.h>
@@ -537,6 +538,7 @@ setup_arch (char **cmdline_p)
537 paravirt_arch_setup_early(); 538 paravirt_arch_setup_early();
538 539
539 ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); 540 ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
541 paravirt_patch_apply();
540 542
541 *cmdline_p = __va(ia64_boot_param->command_line); 543 *cmdline_p = __va(ia64_boot_param->command_line);
542 strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); 544 strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
@@ -730,10 +732,10 @@ static void *
730c_start (struct seq_file *m, loff_t *pos) 732c_start (struct seq_file *m, loff_t *pos)
731{ 733{
732#ifdef CONFIG_SMP 734#ifdef CONFIG_SMP
733 while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) 735 while (*pos < nr_cpu_ids && !cpu_online(*pos))
734 ++*pos; 736 ++*pos;
735#endif 737#endif
736 return *pos < NR_CPUS ? cpu_data(*pos) : NULL; 738 return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
737} 739}
738 740
739static void * 741static void *
@@ -1016,8 +1018,7 @@ cpu_init (void)
1016 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); 1018 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
1017 atomic_inc(&init_mm.mm_count); 1019 atomic_inc(&init_mm.mm_count);
1018 current->active_mm = &init_mm; 1020 current->active_mm = &init_mm;
1019 if (current->mm) 1021 BUG_ON(current->mm);
1020 BUG();
1021 1022
1022 ia64_mmu_init(ia64_imva(cpu_data)); 1023 ia64_mmu_init(ia64_imva(cpu_data));
1023 ia64_mca_cpu_init(ia64_imva(cpu_data)); 1024 ia64_mca_cpu_init(ia64_imva(cpu_data));
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index da8f020d82c1..2ea4199d9c57 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -166,11 +166,11 @@ send_IPI_allbutself (int op)
166 * Called with preemption disabled. 166 * Called with preemption disabled.
167 */ 167 */
168static inline void 168static inline void
169send_IPI_mask(cpumask_t mask, int op) 169send_IPI_mask(const struct cpumask *mask, int op)
170{ 170{
171 unsigned int cpu; 171 unsigned int cpu;
172 172
173 for_each_cpu_mask(cpu, mask) { 173 for_each_cpu(cpu, mask) {
174 send_IPI_single(cpu, op); 174 send_IPI_single(cpu, op);
175 } 175 }
176} 176}
@@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu)
316 send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); 316 send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
317} 317}
318 318
319void arch_send_call_function_ipi(cpumask_t mask) 319void arch_send_call_function_ipi_mask(const struct cpumask *mask)
320{ 320{
321 send_IPI_mask(mask, IPI_CALL_FUNC); 321 send_IPI_mask(mask, IPI_CALL_FUNC);
322} 322}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 52290547c85b..7700e23034bb 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -581,14 +581,14 @@ smp_build_cpu_map (void)
581 581
582 ia64_cpu_to_sapicid[0] = boot_cpu_id; 582 ia64_cpu_to_sapicid[0] = boot_cpu_id;
583 cpus_clear(cpu_present_map); 583 cpus_clear(cpu_present_map);
584 cpu_set(0, cpu_present_map); 584 set_cpu_present(0, true);
585 cpu_set(0, cpu_possible_map); 585 set_cpu_possible(0, true);
586 for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { 586 for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
587 sapicid = smp_boot_data.cpu_phys_id[i]; 587 sapicid = smp_boot_data.cpu_phys_id[i];
588 if (sapicid == boot_cpu_id) 588 if (sapicid == boot_cpu_id)
589 continue; 589 continue;
590 cpu_set(cpu, cpu_present_map); 590 set_cpu_present(cpu, true);
591 cpu_set(cpu, cpu_possible_map); 591 set_cpu_possible(cpu, true);
592 ia64_cpu_to_sapicid[cpu] = sapicid; 592 ia64_cpu_to_sapicid[cpu] = sapicid;
593 cpu++; 593 cpu++;
594 } 594 }
@@ -626,12 +626,9 @@ smp_prepare_cpus (unsigned int max_cpus)
626 */ 626 */
627 if (!max_cpus) { 627 if (!max_cpus) {
628 printk(KERN_INFO "SMP mode deactivated.\n"); 628 printk(KERN_INFO "SMP mode deactivated.\n");
629 cpus_clear(cpu_online_map); 629 init_cpu_online(cpumask_of(0));
630 cpus_clear(cpu_present_map); 630 init_cpu_present(cpumask_of(0));
631 cpus_clear(cpu_possible_map); 631 init_cpu_possible(cpumask_of(0));
632 cpu_set(0, cpu_online_map);
633 cpu_set(0, cpu_present_map);
634 cpu_set(0, cpu_possible_map);
635 return; 632 return;
636 } 633 }
637} 634}
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index d6747bae52d8..641c8b61c4f1 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -51,6 +51,15 @@ EXPORT_SYMBOL(last_cli_ip);
51#endif 51#endif
52 52
53#ifdef CONFIG_PARAVIRT 53#ifdef CONFIG_PARAVIRT
54/* We need to define a real function for sched_clock, to override the
55 * weak default version */
56unsigned long long sched_clock(void)
57{
58 return paravirt_sched_clock();
59}
60#endif
61
62#ifdef CONFIG_PARAVIRT
54static void 63static void
55paravirt_clocksource_resume(void) 64paravirt_clocksource_resume(void)
56{ 65{
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 3765efc5f963..4a95e86b9ac2 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -169,6 +169,30 @@ SECTIONS
169 __end___mckinley_e9_bundles = .; 169 __end___mckinley_e9_bundles = .;
170 } 170 }
171 171
172#if defined(CONFIG_PARAVIRT)
173 . = ALIGN(16);
174 .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET)
175 {
176 __start_paravirt_bundles = .;
177 *(.paravirt_bundles)
178 __stop_paravirt_bundles = .;
179 }
180 . = ALIGN(16);
181 .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET)
182 {
183 __start_paravirt_insts = .;
184 *(.paravirt_insts)
185 __stop_paravirt_insts = .;
186 }
187 . = ALIGN(16);
188 .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET)
189 {
190 __start_paravirt_branches = .;
191 *(.paravirt_branches)
192 __stop_paravirt_branches = .;
193 }
194#endif
195
172#if defined(CONFIG_IA64_GENERIC) 196#if defined(CONFIG_IA64_GENERIC)
173 /* Machine Vector */ 197 /* Machine Vector */
174 . = ALIGN(16); 198 . = ALIGN(16);
@@ -201,6 +225,12 @@ SECTIONS
201 __start_gate_section = .; 225 __start_gate_section = .;
202 *(.data.gate) 226 *(.data.gate)
203 __stop_gate_section = .; 227 __stop_gate_section = .;
228#ifdef CONFIG_XEN
229 . = ALIGN(PAGE_SIZE);
230 __xen_start_gate_section = .;
231 *(.data.gate.xen)
232 __xen_stop_gate_section = .;
233#endif
204 } 234 }
205 . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose 235 . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose
206 * kernel data 236 * kernel data
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 076b00d1dbff..28af6a731bb8 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len)
70 int l; 70 int l;
71 71
72 for (l = 0; l < (len + 32); l += 32) 72 for (l = 0; l < (len + 32); l += 32)
73 ia64_fc(start + l); 73 ia64_fc((void *)(start + l));
74 74
75 ia64_sync_i(); 75 ia64_sync_i();
76 ia64_srlz_i(); 76 ia64_srlz_i();
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index d4d280505878..a18ee17b9192 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -386,7 +386,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1,
386 else 386 else
387 *rnat_addr = (*rnat_addr) & (~nat_mask); 387 *rnat_addr = (*rnat_addr) & (~nat_mask);
388 388
389 ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore); 389 ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore);
390 ia64_setreg(_IA64_REG_AR_RNAT, rnat); 390 ia64_setreg(_IA64_REG_AR_RNAT, rnat);
391 } 391 }
392 local_irq_restore(psr); 392 local_irq_restore(psr);
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 38232b37668b..2c2501f13159 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -210,6 +210,7 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type)
210 phy_pte &= ~PAGE_FLAGS_RV_MASK; 210 phy_pte &= ~PAGE_FLAGS_RV_MASK;
211 psr = ia64_clear_ic(); 211 psr = ia64_clear_ic();
212 ia64_itc(type, va, phy_pte, itir_ps(itir)); 212 ia64_itc(type, va, phy_pte, itir_ps(itir));
213 paravirt_dv_serialize_data();
213 ia64_set_psr(psr); 214 ia64_set_psr(psr);
214 } 215 }
215 216
@@ -456,6 +457,7 @@ void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
456 phy_pte &= ~PAGE_FLAGS_RV_MASK; 457 phy_pte &= ~PAGE_FLAGS_RV_MASK;
457 psr = ia64_clear_ic(); 458 psr = ia64_clear_ic();
458 ia64_itc(type, ifa, phy_pte, ps); 459 ia64_itc(type, ifa, phy_pte, ps);
460 paravirt_dv_serialize_data();
459 ia64_set_psr(psr); 461 ia64_set_psr(psr);
460 } 462 }
461 if (!(pte&VTLB_PTE_IO)) 463 if (!(pte&VTLB_PTE_IO))
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 56e12903973c..c0f3bee69042 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -35,6 +35,7 @@
35#include <asm/uaccess.h> 35#include <asm/uaccess.h>
36#include <asm/unistd.h> 36#include <asm/unistd.h>
37#include <asm/mca.h> 37#include <asm/mca.h>
38#include <asm/paravirt.h>
38 39
39DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
40 41
@@ -259,6 +260,7 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
259static void __init 260static void __init
260setup_gate (void) 261setup_gate (void)
261{ 262{
263 void *gate_section;
262 struct page *page; 264 struct page *page;
263 265
264 /* 266 /*
@@ -266,10 +268,11 @@ setup_gate (void)
266 * headers etc. and once execute-only page to enable 268 * headers etc. and once execute-only page to enable
267 * privilege-promotion via "epc": 269 * privilege-promotion via "epc":
268 */ 270 */
269 page = virt_to_page(ia64_imva(__start_gate_section)); 271 gate_section = paravirt_get_gate_section();
272 page = virt_to_page(ia64_imva(gate_section));
270 put_kernel_page(page, GATE_ADDR, PAGE_READONLY); 273 put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
271#ifdef HAVE_BUGGY_SEGREL 274#ifdef HAVE_BUGGY_SEGREL
272 page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE)); 275 page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE));
273 put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE); 276 put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
274#else 277#else
275 put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE); 278 put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
@@ -633,8 +636,7 @@ mem_init (void)
633#endif 636#endif
634 637
635#ifdef CONFIG_FLATMEM 638#ifdef CONFIG_FLATMEM
636 if (!mem_map) 639 BUG_ON(!mem_map);
637 BUG();
638 max_mapnr = max_low_pfn; 640 max_mapnr = max_low_pfn;
639#endif 641#endif
640 642
@@ -667,8 +669,8 @@ mem_init (void)
667 * code can tell them apart. 669 * code can tell them apart.
668 */ 670 */
669 for (i = 0; i < NR_syscalls; ++i) { 671 for (i = 0; i < NR_syscalls; ++i) {
670 extern unsigned long fsyscall_table[NR_syscalls];
671 extern unsigned long sys_call_table[NR_syscalls]; 672 extern unsigned long sys_call_table[NR_syscalls];
673 unsigned long *fsyscall_table = paravirt_get_fsyscall_table();
672 674
673 if (!fsyscall_table[i] || nolwsys) 675 if (!fsyscall_table[i] || nolwsys)
674 fsyscall_table[i] = sys_call_table[i] | 1; 676 fsyscall_table[i] = sys_call_table[i] | 1;
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index bd9818a36b47..b9f3d7bbb338 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
309 309
310 preempt_disable(); 310 preempt_disable();
311#ifdef CONFIG_SMP 311#ifdef CONFIG_SMP
312 if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) { 312 if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
313 platform_global_tlb_purge(mm, start, end, nbits); 313 platform_global_tlb_purge(mm, start, end, nbits);
314 preempt_enable(); 314 preempt_enable();
315 return; 315 return;
diff --git a/arch/ia64/scripts/pvcheck.sed b/arch/ia64/scripts/pvcheck.sed
index ba66ac2e4c60..e59809a3fc01 100644
--- a/arch/ia64/scripts/pvcheck.sed
+++ b/arch/ia64/scripts/pvcheck.sed
@@ -17,6 +17,7 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g
17s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g 17s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g
18s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr 18s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr
19s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g 19s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g
20s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g
20s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g 21s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g
21s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g 22s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g
22s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g 23s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g
diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c
index 0d4ffa4da1da..57f280dd9def 100644
--- a/arch/ia64/sn/kernel/io_common.c
+++ b/arch/ia64/sn/kernel/io_common.c
@@ -135,8 +135,7 @@ static s64 sn_device_fixup_war(u64 nasid, u64 widget, int device,
135 } 135 }
136 136
137 war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL); 137 war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL);
138 if (!war_list) 138 BUG_ON(!war_list);
139 BUG();
140 139
141 SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST, 140 SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST,
142 nasid, widget, __pa(war_list), 0, 0, 0 ,0); 141 nasid, widget, __pa(war_list), 0, 0, 0 ,0);
@@ -180,23 +179,20 @@ sn_common_hubdev_init(struct hubdev_info *hubdev)
180 sizeof(struct sn_flush_device_kernel *); 179 sizeof(struct sn_flush_device_kernel *);
181 hubdev->hdi_flush_nasid_list.widget_p = 180 hubdev->hdi_flush_nasid_list.widget_p =
182 kzalloc(size, GFP_KERNEL); 181 kzalloc(size, GFP_KERNEL);
183 if (!hubdev->hdi_flush_nasid_list.widget_p) 182 BUG_ON(!hubdev->hdi_flush_nasid_list.widget_p);
184 BUG();
185 183
186 for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) { 184 for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) {
187 size = DEV_PER_WIDGET * 185 size = DEV_PER_WIDGET *
188 sizeof(struct sn_flush_device_kernel); 186 sizeof(struct sn_flush_device_kernel);
189 sn_flush_device_kernel = kzalloc(size, GFP_KERNEL); 187 sn_flush_device_kernel = kzalloc(size, GFP_KERNEL);
190 if (!sn_flush_device_kernel) 188 BUG_ON(!sn_flush_device_kernel);
191 BUG();
192 189
193 dev_entry = sn_flush_device_kernel; 190 dev_entry = sn_flush_device_kernel;
194 for (device = 0; device < DEV_PER_WIDGET; 191 for (device = 0; device < DEV_PER_WIDGET;
195 device++, dev_entry++) { 192 device++, dev_entry++) {
196 size = sizeof(struct sn_flush_device_common); 193 size = sizeof(struct sn_flush_device_common);
197 dev_entry->common = kzalloc(size, GFP_KERNEL); 194 dev_entry->common = kzalloc(size, GFP_KERNEL);
198 if (!dev_entry->common) 195 BUG_ON(!dev_entry->common);
199 BUG();
200 if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST)) 196 if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST))
201 status = sal_get_device_dmaflush_list( 197 status = sal_get_device_dmaflush_list(
202 hubdev->hdi_nasid, widget, device, 198 hubdev->hdi_nasid, widget, device,
@@ -326,8 +322,7 @@ sn_common_bus_fixup(struct pci_bus *bus,
326 */ 322 */
327 controller->platform_data = kzalloc(sizeof(struct sn_platform_data), 323 controller->platform_data = kzalloc(sizeof(struct sn_platform_data),
328 GFP_KERNEL); 324 GFP_KERNEL);
329 if (controller->platform_data == NULL) 325 BUG_ON(controller->platform_data == NULL);
330 BUG();
331 sn_platform_data = 326 sn_platform_data =
332 (struct sn_platform_data *) controller->platform_data; 327 (struct sn_platform_data *) controller->platform_data;
333 sn_platform_data->provider_soft = provider_soft; 328 sn_platform_data->provider_soft = provider_soft;
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index e2eb2da60f96..ee774c366a06 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -128,8 +128,7 @@ sn_legacy_pci_window_fixup(struct pci_controller *controller,
128{ 128{
129 controller->window = kcalloc(2, sizeof(struct pci_window), 129 controller->window = kcalloc(2, sizeof(struct pci_window),
130 GFP_KERNEL); 130 GFP_KERNEL);
131 if (controller->window == NULL) 131 BUG_ON(controller->window == NULL);
132 BUG();
133 controller->window[0].offset = legacy_io; 132 controller->window[0].offset = legacy_io;
134 controller->window[0].resource.name = "legacy_io"; 133 controller->window[0].resource.name = "legacy_io";
135 controller->window[0].resource.flags = IORESOURCE_IO; 134 controller->window[0].resource.flags = IORESOURCE_IO;
@@ -168,8 +167,7 @@ sn_pci_window_fixup(struct pci_dev *dev, unsigned int count,
168 idx = controller->windows; 167 idx = controller->windows;
169 new_count = controller->windows + count; 168 new_count = controller->windows + count;
170 new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL); 169 new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL);
171 if (new_window == NULL) 170 BUG_ON(new_window == NULL);
172 BUG();
173 if (controller->window) { 171 if (controller->window) {
174 memcpy(new_window, controller->window, 172 memcpy(new_window, controller->window,
175 sizeof(struct pci_window) * controller->windows); 173 sizeof(struct pci_window) * controller->windows);
@@ -222,8 +220,7 @@ sn_io_slot_fixup(struct pci_dev *dev)
222 (u64) __pa(pcidev_info), 220 (u64) __pa(pcidev_info),
223 (u64) __pa(sn_irq_info)); 221 (u64) __pa(sn_irq_info));
224 222
225 if (status) 223 BUG_ON(status); /* Cannot get platform pci device information */
226 BUG(); /* Cannot get platform pci device information */
227 224
228 225
229 /* Copy over PIO Mapped Addresses */ 226 /* Copy over PIO Mapped Addresses */
@@ -307,8 +304,7 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
307 prom_bussoft_ptr = __va(prom_bussoft_ptr); 304 prom_bussoft_ptr = __va(prom_bussoft_ptr);
308 305
309 controller = kzalloc(sizeof(*controller), GFP_KERNEL); 306 controller = kzalloc(sizeof(*controller), GFP_KERNEL);
310 if (!controller) 307 BUG_ON(!controller);
311 BUG();
312 controller->segment = segment; 308 controller->segment = segment;
313 309
314 /* 310 /*
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 02c5b8a9fb60..e456f062f241 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -732,8 +732,7 @@ void __init build_cnode_tables(void)
732 kl_config_hdr_t *klgraph_header; 732 kl_config_hdr_t *klgraph_header;
733 nasid = cnodeid_to_nasid(node); 733 nasid = cnodeid_to_nasid(node);
734 klgraph_header = ia64_sn_get_klconfig_addr(nasid); 734 klgraph_header = ia64_sn_get_klconfig_addr(nasid);
735 if (klgraph_header == NULL) 735 BUG_ON(klgraph_header == NULL);
736 BUG();
737 brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info); 736 brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info);
738 while (brd) { 737 while (brd) {
739 if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) { 738 if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) {
@@ -750,7 +749,7 @@ nasid_slice_to_cpuid(int nasid, int slice)
750{ 749{
751 long cpu; 750 long cpu;
752 751
753 for (cpu = 0; cpu < NR_CPUS; cpu++) 752 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
754 if (cpuid_to_nasid(cpu) == nasid && 753 if (cpuid_to_nasid(cpu) == nasid &&
755 cpuid_to_slice(cpu) == slice) 754 cpuid_to_slice(cpu) == slice)
756 return cpu; 755 return cpu;
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index e585f9a2afb9..1176506b2bae 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm)
133 unsigned long itc; 133 unsigned long itc;
134 134
135 itc = ia64_get_itc(); 135 itc = ia64_get_itc();
136 smp_flush_tlb_cpumask(mm->cpu_vm_mask); 136 smp_flush_tlb_cpumask(*mm_cpumask(mm));
137 itc = ia64_get_itc() - itc; 137 itc = ia64_get_itc() - itc;
138 __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc; 138 __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc;
139 __get_cpu_var(ptcstats).shub_ipi_flushes++; 139 __get_cpu_var(ptcstats).shub_ipi_flushes++;
@@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start,
182 nodes_clear(nodes_flushed); 182 nodes_clear(nodes_flushed);
183 i = 0; 183 i = 0;
184 184
185 for_each_cpu_mask(cpu, mm->cpu_vm_mask) { 185 for_each_cpu(cpu, mm_cpumask(mm)) {
186 cnode = cpu_to_node(cpu); 186 cnode = cpu_to_node(cpu);
187 node_set(cnode, nodes_flushed); 187 node_set(cnode, nodes_flushed);
188 lcpu = cpu; 188 lcpu = cpu;
@@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu)
461 461
462static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) 462static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
463{ 463{
464 if (*offset < NR_CPUS) 464 if (*offset < nr_cpu_ids)
465 return offset; 465 return offset;
466 return NULL; 466 return NULL;
467} 467}
@@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
469static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset) 469static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
470{ 470{
471 (*offset)++; 471 (*offset)++;
472 if (*offset < NR_CPUS) 472 if (*offset < nr_cpu_ids)
473 return offset; 473 return offset;
474 return NULL; 474 return NULL;
475} 475}
@@ -491,7 +491,7 @@ static int sn2_ptc_seq_show(struct seq_file *file, void *data)
491 seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt); 491 seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt);
492 } 492 }
493 493
494 if (cpu < NR_CPUS && cpu_online(cpu)) { 494 if (cpu < nr_cpu_ids && cpu_online(cpu)) {
495 stat = &per_cpu(ptcstats, cpu); 495 stat = &per_cpu(ptcstats, cpu);
496 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l, 496 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
497 stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed, 497 stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
@@ -554,7 +554,7 @@ static int __init sn2_ptc_init(void)
554 554
555 proc_sn2_ptc = proc_create(PTC_BASENAME, 0444, 555 proc_sn2_ptc = proc_create(PTC_BASENAME, 0444,
556 NULL, &proc_sn2_ptc_operations); 556 NULL, &proc_sn2_ptc_operations);
557 if (!&proc_sn2_ptc_operations) { 557 if (!proc_sn2_ptc) {
558 printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME); 558 printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
559 return -EINVAL; 559 return -EINVAL;
560 } 560 }
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index be339477f906..9e6491cf72bd 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -275,8 +275,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
275 275
276 /* get it's interconnect topology */ 276 /* get it's interconnect topology */
277 sz = op->ports * sizeof(struct sn_hwperf_port_info); 277 sz = op->ports * sizeof(struct sn_hwperf_port_info);
278 if (sz > sizeof(ptdata)) 278 BUG_ON(sz > sizeof(ptdata));
279 BUG();
280 e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, 279 e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
281 SN_HWPERF_ENUM_PORTS, nodeobj->id, sz, 280 SN_HWPERF_ENUM_PORTS, nodeobj->id, sz,
282 (u64)&ptdata, 0, 0, NULL); 281 (u64)&ptdata, 0, 0, NULL);
@@ -310,8 +309,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
310 if (router && (!found_cpu || !found_mem)) { 309 if (router && (!found_cpu || !found_mem)) {
311 /* search for a node connected to the same router */ 310 /* search for a node connected to the same router */
312 sz = router->ports * sizeof(struct sn_hwperf_port_info); 311 sz = router->ports * sizeof(struct sn_hwperf_port_info);
313 if (sz > sizeof(ptdata)) 312 BUG_ON(sz > sizeof(ptdata));
314 BUG();
315 e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, 313 e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
316 SN_HWPERF_ENUM_PORTS, router->id, sz, 314 SN_HWPERF_ENUM_PORTS, router->id, sz,
317 (u64)&ptdata, 0, 0, NULL); 315 (u64)&ptdata, 0, 0, NULL);
@@ -612,7 +610,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
612 op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK; 610 op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK;
613 611
614 if (cpu != SN_HWPERF_ARG_ANY_CPU) { 612 if (cpu != SN_HWPERF_ARG_ANY_CPU) {
615 if (cpu >= NR_CPUS || !cpu_online(cpu)) { 613 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
616 r = -EINVAL; 614 r = -EINVAL;
617 goto out; 615 goto out;
618 } 616 }
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index 060df4aa9916..c659ad5613a0 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -256,9 +256,7 @@ void sn_dma_flush(u64 addr)
256 256
257 hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo; 257 hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo;
258 258
259 if (!hubinfo) { 259 BUG_ON(!hubinfo);
260 BUG();
261 }
262 260
263 flush_nasid_list = &hubinfo->hdi_flush_nasid_list; 261 flush_nasid_list = &hubinfo->hdi_flush_nasid_list;
264 if (flush_nasid_list->widget_p == NULL) 262 if (flush_nasid_list->widget_p == NULL)
diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile
index 0ad0224693d9..e6f4a0a74228 100644
--- a/arch/ia64/xen/Makefile
+++ b/arch/ia64/xen/Makefile
@@ -3,14 +3,29 @@
3# 3#
4 4
5obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \ 5obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \
6 hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o 6 hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \
7 gate-data.o
7 8
8obj-$(CONFIG_IA64_GENERIC) += machvec.o 9obj-$(CONFIG_IA64_GENERIC) += machvec.o
9 10
11# The gate DSO image is built using a special linker script.
12include $(srctree)/arch/ia64/kernel/Makefile.gate
13
14# tell the build these files are compiled for Xen
15CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN
16AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN
17
18# use the same file as native.
19$(obj)/gate.o: $(src)/../kernel/gate.S FORCE
20 $(call if_changed_dep,as_o_S)
21$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE
22 $(call if_changed_dep,cpp_lds_S)
23
24
10AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN 25AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN
11 26
12# xen multi compile 27# xen multi compile
13ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S 28ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S
14ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o)) 29ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o))
15obj-y += $(ASM_PARAVIRT_OBJS) 30obj-y += $(ASM_PARAVIRT_OBJS)
16define paravirtualized_xen 31define paravirtualized_xen
diff --git a/arch/ia64/xen/gate-data.S b/arch/ia64/xen/gate-data.S
new file mode 100644
index 000000000000..7d4830afc91d
--- /dev/null
+++ b/arch/ia64/xen/gate-data.S
@@ -0,0 +1,3 @@
1 .section .data.gate.xen, "aw"
2
3 .incbin "arch/ia64/xen/gate.so"
diff --git a/arch/ia64/xen/hypercall.S b/arch/ia64/xen/hypercall.S
index 45e02bb64a92..e32dae444dd6 100644
--- a/arch/ia64/xen/hypercall.S
+++ b/arch/ia64/xen/hypercall.S
@@ -9,6 +9,7 @@
9#include <asm/intrinsics.h> 9#include <asm/intrinsics.h>
10#include <asm/xen/privop.h> 10#include <asm/xen/privop.h>
11 11
12#ifdef __INTEL_COMPILER
12/* 13/*
13 * Hypercalls without parameter. 14 * Hypercalls without parameter.
14 */ 15 */
@@ -72,6 +73,7 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4)
72 br.ret.sptk.many rp 73 br.ret.sptk.many rp
73 ;; 74 ;;
74END(xen_set_rr0_to_rr4) 75END(xen_set_rr0_to_rr4)
76#endif
75 77
76GLOBAL_ENTRY(xen_send_ipi) 78GLOBAL_ENTRY(xen_send_ipi)
77 mov r14=r32 79 mov r14=r32
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index 68d6204c3f16..fb8332690179 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -175,10 +175,58 @@ static void xen_itc_jitter_data_reset(void)
175 } while (unlikely(ret != lcycle)); 175 } while (unlikely(ret != lcycle));
176} 176}
177 177
178/* based on xen_sched_clock() in arch/x86/xen/time.c. */
179/*
180 * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If that can't be defined,
181 * similar logic has to be implemented here.
182 */
183/*
184 * Xen sched_clock implementation. Returns the number of unstolen
185 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
186 * states.
187 */
188static unsigned long long xen_sched_clock(void)
189{
190 struct vcpu_runstate_info runstate;
191
192 unsigned long long now;
193 unsigned long long offset;
194 unsigned long long ret;
195
196 /*
197 * Ideally sched_clock should be called on a per-cpu basis
198 * anyway, so preempt should already be disabled, but that's
199 * not the current practice.
200 */
201 preempt_disable();
202
203 /*
204 * Both ia64_native_sched_clock() and Xen's runstate are
205 * based on mAR.ITC, so taking their difference is meaningful.
206 */
207 now = ia64_native_sched_clock();
208
209 get_runstate_snapshot(&runstate);
210
211 WARN_ON(runstate.state != RUNSTATE_running);
212
213 offset = 0;
214 if (now > runstate.state_entry_time)
215 offset = now - runstate.state_entry_time;
216 ret = runstate.time[RUNSTATE_blocked] +
217 runstate.time[RUNSTATE_running] +
218 offset;
219
220 preempt_enable();
221
222 return ret;
223}
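Illustrative arithmetic: if the vCPU has accumulated 40 ms RUNNING and 10 ms BLOCKED and entered its current RUNNING interval 2 ms before now, xen_sched_clock() returns 10 + 40 + 2 = 52 ms; time stolen while RUNNABLE or OFFLINE never enters the sum.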
224
178struct pv_time_ops xen_time_ops __initdata = { 225struct pv_time_ops xen_time_ops __initdata = {
179 .init_missing_ticks_accounting = xen_init_missing_ticks_accounting, 226 .init_missing_ticks_accounting = xen_init_missing_ticks_accounting,
180 .do_steal_accounting = xen_do_steal_accounting, 227 .do_steal_accounting = xen_do_steal_accounting,
181 .clocksource_resume = xen_itc_jitter_data_reset, 228 .clocksource_resume = xen_itc_jitter_data_reset,
229 .sched_clock = xen_sched_clock,
182}; 230};
183 231
184/* Called after suspend, to resume time. */ 232/* Called after suspend, to resume time. */
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index 936cff3c96e0..5e2270a999fa 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -24,6 +24,7 @@
24#include <linux/irq.h> 24#include <linux/irq.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/pm.h> 26#include <linux/pm.h>
27#include <linux/unistd.h>
27 28
28#include <asm/xen/hypervisor.h> 29#include <asm/xen/hypervisor.h>
29#include <asm/xen/xencomm.h> 30#include <asm/xen/xencomm.h>
@@ -153,6 +154,13 @@ xen_post_smp_prepare_boot_cpu(void)
153 xen_setup_vcpu_info_placement(); 154 xen_setup_vcpu_info_placement();
154} 155}
155 156
157#ifdef ASM_SUPPORTED
158static unsigned long __init_or_module
159xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
160#endif
161static void __init
162xen_patch_branch(unsigned long tag, unsigned long type);
163
156static const struct pv_init_ops xen_init_ops __initconst = { 164static const struct pv_init_ops xen_init_ops __initconst = {
157 .banner = xen_banner, 165 .banner = xen_banner,
158 166
@@ -163,6 +171,53 @@ static const struct pv_init_ops xen_init_ops __initconst = {
163 .arch_setup_nomca = xen_arch_setup_nomca, 171 .arch_setup_nomca = xen_arch_setup_nomca,
164 172
165 .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu, 173 .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu,
174#ifdef ASM_SUPPORTED
175 .patch_bundle = xen_patch_bundle,
176#endif
177 .patch_branch = xen_patch_branch,
178};
179
180/***************************************************************************
181 * pv_fsys_data
182 * addresses for fsys
183 */
184
185extern unsigned long xen_fsyscall_table[NR_syscalls];
186extern char xen_fsys_bubble_down[];
187struct pv_fsys_data xen_fsys_data __initdata = {
188 .fsyscall_table = (unsigned long *)xen_fsyscall_table,
189 .fsys_bubble_down = (void *)xen_fsys_bubble_down,
190};
191
192/***************************************************************************
193 * pv_patchdata
194 * patchdata addresses
195 */
196
197#define DECLARE(name) \
198 extern unsigned long __xen_start_gate_##name##_patchlist[]; \
199 extern unsigned long __xen_end_gate_##name##_patchlist[]
200
201DECLARE(fsyscall);
202DECLARE(brl_fsys_bubble_down);
203DECLARE(vtop);
204DECLARE(mckinley_e9);
205
206extern unsigned long __xen_start_gate_section[];
207
208#define ASSIGN(name) \
209 .start_##name##_patchlist = \
210 (unsigned long)__xen_start_gate_##name##_patchlist, \
211 .end_##name##_patchlist = \
212 (unsigned long)__xen_end_gate_##name##_patchlist
213
214static struct pv_patchdata xen_patchdata __initdata = {
215 ASSIGN(fsyscall),
216 ASSIGN(brl_fsys_bubble_down),
217 ASSIGN(vtop),
218 ASSIGN(mckinley_e9),
219
220 .gate_section = (void*)__xen_start_gate_section,
166}; 221};
167 222
168/*************************************************************************** 223/***************************************************************************
@@ -170,6 +225,76 @@ static const struct pv_init_ops xen_init_ops __initconst = {
170 * intrinsics hooks. 225 * intrinsics hooks.
171 */ 226 */
172 227
228#ifndef ASM_SUPPORTED
229static void
230xen_set_itm_with_offset(unsigned long val)
231{
232	/* ia64_cpu_local_tick() calls this with interrupts enabled. */
233 /* WARN_ON(!irqs_disabled()); */
234 xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
235}
236
237static unsigned long
238xen_get_itm_with_offset(void)
239{
240	/* unused at the moment */
241 printk(KERN_DEBUG "%s is called.\n", __func__);
242
243 WARN_ON(!irqs_disabled());
244 return ia64_native_getreg(_IA64_REG_CR_ITM) +
245 XEN_MAPPEDREGS->itc_offset;
246}
247
248/* ia64_set_itc() is only called by
249 * cpu_init() with ia64_set_itc(0) and ia64_sync_itc().
250 * So XEN_MAPPEDREGS->itc_offset can be considered almost constant.
251 */
252static void
253xen_set_itc(unsigned long val)
254{
255 unsigned long mitc;
256
257 WARN_ON(!irqs_disabled());
258 mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
259 XEN_MAPPEDREGS->itc_offset = val - mitc;
260 XEN_MAPPEDREGS->itc_last = val;
261}
262
263static unsigned long
264xen_get_itc(void)
265{
266 unsigned long res;
267 unsigned long itc_offset;
268 unsigned long itc_last;
269 unsigned long ret_itc_last;
270
271 itc_offset = XEN_MAPPEDREGS->itc_offset;
272 do {
273 itc_last = XEN_MAPPEDREGS->itc_last;
274 res = ia64_native_getreg(_IA64_REG_AR_ITC);
275 res += itc_offset;
276 if (itc_last >= res)
277 res = itc_last + 1;
278 ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
279 itc_last, res);
280 } while (unlikely(ret_itc_last != itc_last));
281 return res;
282
283#if 0
284	/* ia64_itc_udelay() calls ia64_get_itc() with interrupts enabled.
285 Should it be paravirtualized instead? */
286 WARN_ON(!irqs_disabled());
287 itc_offset = XEN_MAPPEDREGS->itc_offset;
288 itc_last = XEN_MAPPEDREGS->itc_last;
289 res = ia64_native_getreg(_IA64_REG_AR_ITC);
290 res += itc_offset;
291 if (itc_last >= res)
292 res = itc_last + 1;
293 XEN_MAPPEDREGS->itc_last = res;
294 return res;
295#endif
296}
297
173static void xen_setreg(int regnum, unsigned long val) 298static void xen_setreg(int regnum, unsigned long val)
174{ 299{
175 switch (regnum) { 300 switch (regnum) {
@@ -181,11 +306,14 @@ static void xen_setreg(int regnum, unsigned long val)
181 xen_set_eflag(val); 306 xen_set_eflag(val);
182 break; 307 break;
183#endif 308#endif
309 case _IA64_REG_AR_ITC:
310 xen_set_itc(val);
311 break;
184 case _IA64_REG_CR_TPR: 312 case _IA64_REG_CR_TPR:
185 xen_set_tpr(val); 313 xen_set_tpr(val);
186 break; 314 break;
187 case _IA64_REG_CR_ITM: 315 case _IA64_REG_CR_ITM:
188 xen_set_itm(val); 316 xen_set_itm_with_offset(val);
189 break; 317 break;
190 case _IA64_REG_CR_EOI: 318 case _IA64_REG_CR_EOI:
191 xen_eoi(val); 319 xen_eoi(val);
@@ -209,6 +337,12 @@ static unsigned long xen_getreg(int regnum)
209 res = xen_get_eflag(); 337 res = xen_get_eflag();
210 break; 338 break;
211#endif 339#endif
340 case _IA64_REG_AR_ITC:
341 res = xen_get_itc();
342 break;
343 case _IA64_REG_CR_ITM:
344 res = xen_get_itm_with_offset();
345 break;
212 case _IA64_REG_CR_IVR: 346 case _IA64_REG_CR_IVR:
213 res = xen_get_ivr(); 347 res = xen_get_ivr();
214 break; 348 break;
@@ -259,8 +393,417 @@ xen_intrin_local_irq_restore(unsigned long mask)
259 else 393 else
260 xen_rsm_i(); 394 xen_rsm_i();
261} 395}
396#else
397#define __DEFINE_FUNC(name, code) \
398 extern const char xen_ ## name ## _direct_start[]; \
399 extern const char xen_ ## name ## _direct_end[]; \
400 asm (".align 32\n" \
401 ".proc xen_" #name "\n" \
402 "xen_" #name ":\n" \
403 "xen_" #name "_direct_start:\n" \
404 code \
405 "xen_" #name "_direct_end:\n" \
406 "br.cond.sptk.many b6\n" \
407 ".endp xen_" #name "\n")
408
409#define DEFINE_VOID_FUNC0(name, code) \
410 extern void \
411 xen_ ## name (void); \
412 __DEFINE_FUNC(name, code)
413
414#define DEFINE_VOID_FUNC1(name, code) \
415 extern void \
416 xen_ ## name (unsigned long arg); \
417 __DEFINE_FUNC(name, code)
418
419#define DEFINE_VOID_FUNC1_VOID(name, code) \
420 extern void \
421 xen_ ## name (void *arg); \
422 __DEFINE_FUNC(name, code)
423
424#define DEFINE_VOID_FUNC2(name, code) \
425 extern void \
426 xen_ ## name (unsigned long arg0, \
427 unsigned long arg1); \
428 __DEFINE_FUNC(name, code)
262 429
263static const struct pv_cpu_ops xen_cpu_ops __initdata = { 430#define DEFINE_FUNC0(name, code) \
431 extern unsigned long \
432 xen_ ## name (void); \
433 __DEFINE_FUNC(name, code)
434
435#define DEFINE_FUNC1(name, type, code) \
436 extern unsigned long \
437 xen_ ## name (type arg); \
438 __DEFINE_FUNC(name, code)
439
440#define XEN_PSR_I_ADDR_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS)
441
442/*
443 * static void xen_set_itm_with_offset(unsigned long val)
444 * xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
445 */
446/* 2 bundles */
447DEFINE_VOID_FUNC1(set_itm_with_offset,
448 "mov r2 = " __stringify(XSI_BASE) " + "
449 __stringify(XSI_ITC_OFFSET_OFS) "\n"
450 ";;\n"
451 "ld8 r3 = [r2]\n"
452 ";;\n"
453 "sub r8 = r8, r3\n"
454 "break " __stringify(HYPERPRIVOP_SET_ITM) "\n");
455
456/*
457 * static unsigned long xen_get_itm_with_offset(void)
458 * return ia64_native_getreg(_IA64_REG_CR_ITM) + XEN_MAPPEDREGS->itc_offset;
459 */
460/* 2 bundles */
461DEFINE_FUNC0(get_itm_with_offset,
462 "mov r2 = " __stringify(XSI_BASE) " + "
463 __stringify(XSI_ITC_OFFSET_OFS) "\n"
464 ";;\n"
465 "ld8 r3 = [r2]\n"
466 "mov r8 = cr.itm\n"
467 ";;\n"
468 "add r8 = r8, r2\n");
469
470/*
471 * static void xen_set_itc(unsigned long val)
472 * unsigned long mitc;
473 *
474 * WARN_ON(!irqs_disabled());
475 * mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
476 * XEN_MAPPEDREGS->itc_offset = val - mitc;
477 * XEN_MAPPEDREGS->itc_last = val;
478 */
479/* 2 bundles */
480DEFINE_VOID_FUNC1(set_itc,
481 "mov r2 = " __stringify(XSI_BASE) " + "
482 __stringify(XSI_ITC_LAST_OFS) "\n"
483 "mov r3 = ar.itc\n"
484 ";;\n"
485 "sub r3 = r8, r3\n"
486 "st8 [r2] = r8, "
487 __stringify(XSI_ITC_LAST_OFS) " - "
488 __stringify(XSI_ITC_OFFSET_OFS) "\n"
489 ";;\n"
490 "st8 [r2] = r3\n");
491
492/*
493 * static unsigned long xen_get_itc(void)
494 * unsigned long res;
495 * unsigned long itc_offset;
496 * unsigned long itc_last;
497 * unsigned long ret_itc_last;
498 *
499 * itc_offset = XEN_MAPPEDREGS->itc_offset;
500 * do {
501 * itc_last = XEN_MAPPEDREGS->itc_last;
502 * res = ia64_native_getreg(_IA64_REG_AR_ITC);
503 * res += itc_offset;
504 * if (itc_last >= res)
505 * res = itc_last + 1;
506 * ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
507 * itc_last, res);
508 * } while (unlikely(ret_itc_last != itc_last));
509 * return res;
510 */
511/* 5 bundles */
512DEFINE_FUNC0(get_itc,
513 "mov r2 = " __stringify(XSI_BASE) " + "
514 __stringify(XSI_ITC_OFFSET_OFS) "\n"
515 ";;\n"
516 "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - "
517 __stringify(XSI_ITC_OFFSET_OFS) "\n"
518 /* r9 = itc_offset */
519	     /* r2 = XSI_ITC_LAST */
520 "888:\n"
521 "mov r8 = ar.itc\n" /* res = ar.itc */
522 ";;\n"
523 "ld8 r3 = [r2]\n" /* r3 = itc_last */
524 "add r8 = r8, r9\n" /* res = ar.itc + itc_offset */
525 ";;\n"
526 "cmp.gtu p6, p0 = r3, r8\n"
527 ";;\n"
528 "(p6) add r8 = 1, r3\n" /* if (itc_last > res) itc_last + 1 */
529 ";;\n"
530 "mov ar.ccv = r8\n"
531 ";;\n"
532 "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n"
533 ";;\n"
534 "cmp.ne p6, p0 = r10, r3\n"
535 "(p6) hint @pause\n"
536 "(p6) br.cond.spnt 888b\n");
537
538DEFINE_VOID_FUNC1_VOID(fc,
539 "break " __stringify(HYPERPRIVOP_FC) "\n");
540
541/*
542 * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR
543 * masked_addr = *psr_i_addr_addr
544 * pending_intr_addr = masked_addr - 1
545 * if (val & IA64_PSR_I) {
546 * masked = *masked_addr
547 * *masked_addr = 0:xen_set_virtual_psr_i(1)
548 * compiler barrier
549 * if (masked) {
550 * uint8_t pending = *pending_intr_addr;
551 * if (pending)
552 * XEN_HYPER_SSM_I
553 * }
554 * } else {
555 * *masked_addr = 1:xen_set_virtual_psr_i(0)
556 * }
557 */
558/* 6 bundles */
559DEFINE_VOID_FUNC1(intrin_local_irq_restore,
560 /* r8 = input value: 0 or IA64_PSR_I
561 * p6 = (flags & IA64_PSR_I)
562 * = if clause
563 * p7 = !(flags & IA64_PSR_I)
564 * = else clause
565 */
566 "cmp.ne p6, p7 = r8, r0\n"
567 "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
568 ";;\n"
569 /* r9 = XEN_PSR_I_ADDR */
570 "ld8 r9 = [r9]\n"
571 ";;\n"
572
573 /* r10 = masked previous value */
574 "(p6) ld1.acq r10 = [r9]\n"
575 ";;\n"
576
577	/* p8 = (masked != 0): were interrupts previously masked? */
578 "(p6) cmp.ne.unc p8, p0 = r10, r0\n"
579
580 /* p7 = else clause */
581 "(p7) mov r11 = 1\n"
582 ";;\n"
583 /* masked = 1 */
584 "(p7) st1.rel [r9] = r11\n"
585
586 /* p6 = if clause */
587 /* masked = 0
588 * r9 = masked_addr - 1
589 * = pending_intr_addr
590 */
591 "(p8) st1.rel [r9] = r0, -1\n"
592 ";;\n"
593 /* r8 = pending_intr */
594 "(p8) ld1.acq r11 = [r9]\n"
595 ";;\n"
596 /* p9 = interrupt pending? */
597 "(p8) cmp.ne.unc p9, p10 = r11, r0\n"
598 ";;\n"
599 "(p10) mf\n"
600 /* issue hypercall to trigger interrupt */
601 "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n");
602
603DEFINE_VOID_FUNC2(ptcga,
604 "break " __stringify(HYPERPRIVOP_PTC_GA) "\n");
605DEFINE_VOID_FUNC2(set_rr,
606 "break " __stringify(HYPERPRIVOP_SET_RR) "\n");
607
608/*
609 * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR;
610 * tmp = *tmp;
611 * tmp = *tmp;
612 * psr_i = tmp? 0: IA64_PSR_I;
613 */
614/* 4 bundles */
615DEFINE_FUNC0(get_psr_i,
616 "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
617 ";;\n"
618 "ld8 r9 = [r9]\n" /* r9 = XEN_PSR_I_ADDR */
619 "mov r8 = 0\n" /* psr_i = 0 */
620 ";;\n"
621 "ld1.acq r9 = [r9]\n" /* r9 = XEN_PSR_I */
622 ";;\n"
623 "cmp.eq.unc p6, p0 = r9, r0\n" /* p6 = (XEN_PSR_I != 0) */
624 ";;\n"
625 "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n");
626
627DEFINE_FUNC1(thash, unsigned long,
628 "break " __stringify(HYPERPRIVOP_THASH) "\n");
629DEFINE_FUNC1(get_cpuid, int,
630 "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n");
631DEFINE_FUNC1(get_pmd, int,
632 "break " __stringify(HYPERPRIVOP_GET_PMD) "\n");
633DEFINE_FUNC1(get_rr, unsigned long,
634 "break " __stringify(HYPERPRIVOP_GET_RR) "\n");
635
636/*
637 * void xen_privop_ssm_i(void)
638 *
639 * int masked = !xen_get_virtual_psr_i();
640 * // masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr)
641 * xen_set_virtual_psr_i(1)
642 * // *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0
643 * // compiler barrier
644 * if (masked) {
645 * uint8_t* pend_int_addr =
646 * (uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1;
647 * uint8_t pending = *pend_int_addr;
648 * if (pending)
649 * XEN_HYPER_SSM_I
650 * }
651 */
652/* 4 bundles */
653DEFINE_VOID_FUNC0(ssm_i,
654 "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
655 ";;\n"
656 "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I_ADDR */
657 ";;\n"
658 "ld1.acq r9 = [r8]\n" /* r9 = XEN_PSR_I */
659 ";;\n"
660 "st1.rel [r8] = r0, -1\n" /* psr_i = 0. enable interrupt
661 * r8 = XEN_PSR_I_ADDR - 1
662 * = pend_int_addr
663 */
664 "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I
665 * previously interrupt
666 * masked?
667 */
668 ";;\n"
669 "(p6) ld1.acq r8 = [r8]\n" /* r8 = xen_pend_int */
670 ";;\n"
671 "(p6) cmp.eq.unc p6, p7 = r8, r0\n" /*interrupt pending?*/
672 ";;\n"
673 /* issue hypercall to get interrupt */
674 "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
675 ";;\n");
676
677/*
678 * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr
679 * = XEN_PSR_I_ADDR_ADDR;
680 * psr_i_addr = *psr_i_addr_addr;
681 * *psr_i_addr = 1;
682 */
683/* 2 bundles */
684DEFINE_VOID_FUNC0(rsm_i,
685 "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
686	     /* r8 = XEN_PSR_I_ADDR_ADDR */
687 "mov r9 = 1\n"
688 ";;\n"
689 "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I */
690 ";;\n"
691 "st1.rel [r8] = r9\n"); /* XEN_PSR_I = 1 */
692
693extern void
694xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
695 unsigned long val2, unsigned long val3,
696 unsigned long val4);
697__DEFINE_FUNC(set_rr0_to_rr4,
698 "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n");
699
700
701extern unsigned long xen_getreg(int regnum);
702#define __DEFINE_GET_REG(id, privop) \
703 "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
704 ";;\n" \
705 "cmp.eq p6, p0 = r2, r8\n" \
706 ";;\n" \
707 "(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n" \
708 "(p6) br.cond.sptk.many b6\n" \
709 ";;\n"
710
711__DEFINE_FUNC(getreg,
712 __DEFINE_GET_REG(PSR, PSR)
713#ifdef CONFIG_IA32_SUPPORT
714 __DEFINE_GET_REG(AR_EFLAG, EFLAG)
715#endif
716
717 /* get_itc */
718 "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
719 ";;\n"
720 "cmp.eq p6, p0 = r2, r8\n"
721 ";;\n"
722 "(p6) br.cond.spnt xen_get_itc\n"
723 ";;\n"
724
725	      /* get_itm */
726 "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
727 ";;\n"
728 "cmp.eq p6, p0 = r2, r8\n"
729 ";;\n"
730 "(p6) br.cond.spnt xen_get_itm_with_offset\n"
731 ";;\n"
732
733 __DEFINE_GET_REG(CR_IVR, IVR)
734 __DEFINE_GET_REG(CR_TPR, TPR)
735
736 /* fall back */
737 "movl r2 = ia64_native_getreg_func\n"
738 ";;\n"
739 "mov b7 = r2\n"
740 ";;\n"
741 "br.cond.sptk.many b7\n");
742
743extern void xen_setreg(int regnum, unsigned long val);
744#define __DEFINE_SET_REG(id, privop) \
745 "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
746 ";;\n" \
747 "cmp.eq p6, p0 = r2, r9\n" \
748 ";;\n" \
749 "(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n" \
750 "(p6) br.cond.sptk.many b6\n" \
751 ";;\n"
752
753__DEFINE_FUNC(setreg,
754	      /* kr0 .. kr7 */
755 /*
756 * if (_IA64_REG_AR_KR0 <= regnum &&
757 * regnum <= _IA64_REG_AR_KR7) {
758 * register __index asm ("r8") = regnum - _IA64_REG_AR_KR0
759 * register __val asm ("r9") = val
760 * "break HYPERPRIVOP_SET_KR"
761 * }
762 */
763 "mov r17 = r9\n"
764 "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n"
765 ";;\n"
766 "cmp.ge p6, p0 = r9, r2\n"
767 "sub r17 = r17, r2\n"
768 ";;\n"
769 "(p6) cmp.ge.unc p7, p0 = "
770 __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0)
771 ", r17\n"
772 ";;\n"
773 "(p7) mov r9 = r8\n"
774 ";;\n"
775 "(p7) mov r8 = r17\n"
776 "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n"
777
778 /* set itm */
779 "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
780 ";;\n"
781 "cmp.eq p6, p0 = r2, r8\n"
782 ";;\n"
783 "(p6) br.cond.spnt xen_set_itm_with_offset\n"
784
785 /* set itc */
786 "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
787 ";;\n"
788 "cmp.eq p6, p0 = r2, r8\n"
789 ";;\n"
790 "(p6) br.cond.spnt xen_set_itc\n"
791
792#ifdef CONFIG_IA32_SUPPORT
793 __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG)
794#endif
795 __DEFINE_SET_REG(CR_TPR, SET_TPR)
796 __DEFINE_SET_REG(CR_EOI, EOI)
797
798 /* fall back */
799 "movl r2 = ia64_native_setreg_func\n"
800 ";;\n"
801 "mov b7 = r2\n"
802 ";;\n"
803 "br.cond.sptk.many b7\n");
804#endif
805
806static const struct pv_cpu_ops xen_cpu_ops __initconst = {
264 .fc = xen_fc, 807 .fc = xen_fc,
265 .thash = xen_thash, 808 .thash = xen_thash,
266 .get_cpuid = xen_get_cpuid, 809 .get_cpuid = xen_get_cpuid,
@@ -337,7 +880,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
337 HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); 880 HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
338} 881}
339 882
340static const struct pv_iosapic_ops xen_iosapic_ops __initconst = { 883static struct pv_iosapic_ops xen_iosapic_ops __initdata = {
341 .pcat_compat_init = xen_pcat_compat_init, 884 .pcat_compat_init = xen_pcat_compat_init,
342 .__get_irq_chip = xen_iosapic_get_irq_chip, 885 .__get_irq_chip = xen_iosapic_get_irq_chip,
343 886
@@ -355,6 +898,8 @@ xen_setup_pv_ops(void)
355 xen_info_init(); 898 xen_info_init();
356 pv_info = xen_info; 899 pv_info = xen_info;
357 pv_init_ops = xen_init_ops; 900 pv_init_ops = xen_init_ops;
901 pv_fsys_data = xen_fsys_data;
902 pv_patchdata = xen_patchdata;
358 pv_cpu_ops = xen_cpu_ops; 903 pv_cpu_ops = xen_cpu_ops;
359 pv_iosapic_ops = xen_iosapic_ops; 904 pv_iosapic_ops = xen_iosapic_ops;
360 pv_irq_ops = xen_irq_ops; 905 pv_irq_ops = xen_irq_ops;
@@ -362,3 +907,252 @@ xen_setup_pv_ops(void)
362 907
363 paravirt_cpu_asm_init(&xen_cpu_asm_switch); 908 paravirt_cpu_asm_init(&xen_cpu_asm_switch);
364} 909}
910
911#ifdef ASM_SUPPORTED
912/***************************************************************************
913 * binary patching
914 * pv_init_ops.patch_bundle
915 */
916
917#define DEFINE_FUNC_GETREG(name, privop) \
918 DEFINE_FUNC0(get_ ## name, \
919 "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n")
920
921DEFINE_FUNC_GETREG(psr, PSR);
922DEFINE_FUNC_GETREG(eflag, EFLAG);
923DEFINE_FUNC_GETREG(ivr, IVR);
924DEFINE_FUNC_GETREG(tpr, TPR);
925
926#define DEFINE_FUNC_SET_KR(n) \
927 DEFINE_VOID_FUNC0(set_kr ## n, \
928 ";;\n" \
929 "mov r9 = r8\n" \
930 "mov r8 = " #n "\n" \
931 "break " __stringify(HYPERPRIVOP_SET_KR) "\n")
932
933DEFINE_FUNC_SET_KR(0);
934DEFINE_FUNC_SET_KR(1);
935DEFINE_FUNC_SET_KR(2);
936DEFINE_FUNC_SET_KR(3);
937DEFINE_FUNC_SET_KR(4);
938DEFINE_FUNC_SET_KR(5);
939DEFINE_FUNC_SET_KR(6);
940DEFINE_FUNC_SET_KR(7);
941
942#define __DEFINE_FUNC_SETREG(name, privop) \
943 DEFINE_VOID_FUNC0(name, \
944 "break "__stringify(HYPERPRIVOP_ ## privop) "\n")
945
946#define DEFINE_FUNC_SETREG(name, privop) \
947 __DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop)
948
949DEFINE_FUNC_SETREG(eflag, EFLAG);
950DEFINE_FUNC_SETREG(tpr, TPR);
951__DEFINE_FUNC_SETREG(eoi, EOI);
952
953extern const char xen_check_events[];
954extern const char __xen_intrin_local_irq_restore_direct_start[];
955extern const char __xen_intrin_local_irq_restore_direct_end[];
956extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc;
957
958asm (
959 ".align 32\n"
960 ".proc xen_check_events\n"
961 "xen_check_events:\n"
962 /* masked = 0
963 * r9 = masked_addr - 1
964 * = pending_intr_addr
965 */
966 "st1.rel [r9] = r0, -1\n"
967 ";;\n"
968 /* r8 = pending_intr */
969 "ld1.acq r11 = [r9]\n"
970 ";;\n"
971 /* p9 = interrupt pending? */
972 "cmp.ne p9, p10 = r11, r0\n"
973 ";;\n"
974 "(p10) mf\n"
975 /* issue hypercall to trigger interrupt */
976 "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
977 "br.cond.sptk.many b6\n"
978 ".endp xen_check_events\n"
979 "\n"
980 ".align 32\n"
981 ".proc __xen_intrin_local_irq_restore_direct\n"
982 "__xen_intrin_local_irq_restore_direct:\n"
983 "__xen_intrin_local_irq_restore_direct_start:\n"
984 "1:\n"
985 "{\n"
986 "cmp.ne p6, p7 = r8, r0\n"
987 "mov r17 = ip\n" /* get ip to calc return address */
988 "mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n"
989 ";;\n"
990 "}\n"
991 "{\n"
992 /* r9 = XEN_PSR_I_ADDR */
993 "ld8 r9 = [r9]\n"
994 ";;\n"
995 /* r10 = masked previous value */
996 "(p6) ld1.acq r10 = [r9]\n"
997 "adds r17 = 1f - 1b, r17\n" /* calculate return address */
998 ";;\n"
999 "}\n"
1000 "{\n"
1001	/* p8 = (masked != 0): were interrupts previously masked? */
1002 "(p6) cmp.ne.unc p8, p0 = r10, r0\n"
1003 "\n"
1004 /* p7 = else clause */
1005 "(p7) mov r11 = 1\n"
1006 ";;\n"
1007 "(p8) mov b6 = r17\n" /* set return address */
1008 "}\n"
1009 "{\n"
1010 /* masked = 1 */
1011 "(p7) st1.rel [r9] = r11\n"
1012 "\n"
1013 "[99:]\n"
1014 "(p8) brl.cond.dptk.few xen_check_events\n"
1015 "}\n"
1016	/* the pv calling stub is 5 bundles; pad with nops so the return address lines up */
1017 "{\n"
1018 "nop 0\n"
1019 "nop 0\n"
1020 "nop 0\n"
1021 "}\n"
1022 "1:\n"
1023 "__xen_intrin_local_irq_restore_direct_end:\n"
1024 ".endp __xen_intrin_local_irq_restore_direct\n"
1025 "\n"
1026 ".align 8\n"
1027 "__xen_intrin_local_irq_restore_direct_reloc:\n"
1028 "data8 99b\n"
1029);
1030
1031static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[]
1032__initdata_or_module =
1033{
1034#define XEN_PATCH_BUNDLE_ELEM(name, type) \
1035 { \
1036 (void*)xen_ ## name ## _direct_start, \
1037 (void*)xen_ ## name ## _direct_end, \
1038 PARAVIRT_PATCH_TYPE_ ## type, \
1039 }
1040
1041 XEN_PATCH_BUNDLE_ELEM(fc, FC),
1042 XEN_PATCH_BUNDLE_ELEM(thash, THASH),
1043 XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
1044 XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
1045 XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
1046 XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
1047 XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
1048 XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
1049 XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
1050 XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
1051 XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
1052 {
1053 (void*)__xen_intrin_local_irq_restore_direct_start,
1054 (void*)__xen_intrin_local_irq_restore_direct_end,
1055 PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE,
1056 },
1057
1058#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg) \
1059 { \
1060 xen_get_ ## name ## _direct_start, \
1061 xen_get_ ## name ## _direct_end, \
1062 PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \
1063 }
1064
1065 XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
1066 XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG),
1067
1068 XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR),
1069 XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR),
1070
1071 XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC),
1072 XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM),
1073
1074
1075#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
1076 { \
1077 xen_ ## name ## _direct_start, \
1078 xen_ ## name ## _direct_end, \
1079 PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \
1080 }
1081
1082#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
1083 __XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg)
1084
1085 XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0),
1086 XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1),
1087 XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2),
1088 XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3),
1089 XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4),
1090 XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5),
1091 XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6),
1092 XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7),
1093
1094 XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG),
1095 XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR),
1096 __XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI),
1097
1098 XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC),
1099 XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM),
1100};
1101
1102static unsigned long __init_or_module
1103xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
1104{
1105 const unsigned long nelems = sizeof(xen_patch_bundle_elems) /
1106 sizeof(xen_patch_bundle_elems[0]);
1107 unsigned long used;
1108 const struct paravirt_patch_bundle_elem *found;
1109
1110 used = __paravirt_patch_apply_bundle(sbundle, ebundle, type,
1111 xen_patch_bundle_elems, nelems,
1112 &found);
1113
1114 if (found == NULL)
1115 /* fallback */
1116 return ia64_native_patch_bundle(sbundle, ebundle, type);
1117 if (used == 0)
1118 return used;
1119
1120 /* relocation */
1121 switch (type) {
1122 case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: {
1123 unsigned long reloc =
1124 __xen_intrin_local_irq_restore_direct_reloc;
1125 unsigned long reloc_offset = reloc - (unsigned long)
1126 __xen_intrin_local_irq_restore_direct_start;
1127 unsigned long tag = (unsigned long)sbundle + reloc_offset;
1128 paravirt_patch_reloc_brl(tag, xen_check_events);
1129 break;
1130 }
1131 default:
1132 /* nothing */
1133 break;
1134 }
1135 return used;
1136}
1137#endif /* ASM_SUPPORTED */
1138
1139const struct paravirt_patch_branch_target xen_branch_target[]
1140__initconst = {
1141#define PARAVIRT_BR_TARGET(name, type) \
1142 { \
1143 &xen_ ## name, \
1144 PARAVIRT_PATCH_TYPE_BR_ ## type, \
1145 }
1146 PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
1147 PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
1148 PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
1149 PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
1150};
1151
1152static void __init
1153xen_patch_branch(unsigned long tag, unsigned long type)
1154{
1155 const unsigned long nelem =
1156 sizeof(xen_branch_target) / sizeof(xen_branch_target[0]);
1157 __paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem);
1158}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 8c3c25f35578..5054c2ddd1a0 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,24 +17,38 @@
16 17
17#define SMAP 0x534d4150 /* ASCII "SMAP" */ 18#define SMAP 0x534d4150 /* ASCII "SMAP" */
18 19
20struct e820_ext_entry {
21 struct e820entry std;
22 u32 ext_flags;
23} __attribute__((packed));
24
19static int detect_memory_e820(void) 25static int detect_memory_e820(void)
20{ 26{
21 int count = 0; 27 int count = 0;
22 u32 next = 0; 28 u32 next = 0;
23 u32 size, id; 29 u32 size, id, edi;
24 u8 err; 30 u8 err;
25 struct e820entry *desc = boot_params.e820_map; 31 struct e820entry *desc = boot_params.e820_map;
32 static struct e820_ext_entry buf; /* static so it is zeroed */
33
34 /*
35 * Set this here so that if the BIOS doesn't change this field
36 * but still doesn't change %ecx, we're still okay...
37 */
38 buf.ext_flags = 1;
26 39
27 do { 40 do {
28 size = sizeof(struct e820entry); 41 size = sizeof buf;
29 42
30 /* Important: %edx is clobbered by some BIOSes, 43 /* Important: %edx and %esi are clobbered by some BIOSes,
31 so it must be either used for the error output 44 so they must be either used for the error output
32 or explicitly marked clobbered. */ 45 or explicitly marked clobbered. Given that, assume there
33 asm("int $0x15; setc %0" 46 is something out there clobbering %ebp and %edi, too. */
47 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
34 : "=d" (err), "+b" (next), "=a" (id), "+c" (size), 48 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
35 "=m" (*desc) 49 "=D" (edi), "+m" (buf)
36 : "D" (desc), "d" (SMAP), "a" (0xe820)); 50 : "D" (&buf), "d" (SMAP), "a" (0xe820)
51 : "esi");
37 52
38 /* BIOSes which terminate the chain with CF = 1 as opposed 53 /* BIOSes which terminate the chain with CF = 1 as opposed
39 to %ebx = 0 don't always report the SMAP signature on 54 to %ebx = 0 don't always report the SMAP signature on
@@ -51,8 +66,14 @@ static int detect_memory_e820(void)
51 break; 66 break;
52 } 67 }
53 68
69 /* ACPI 3.0 added the extended flags support. If bit 0
70 in the extended flags is zero, we're supposed to simply
71 ignore the entry -- a backwards incompatible change! */
72 if (size > 20 && !(buf.ext_flags & 1))
73 continue;
74
75 *desc++ = buf.std;
54 count++; 76 count++;
55 desc++;
56 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 77 } while (next && count < ARRAY_SIZE(boot_params.e820_map));
57 78
58 return boot_params.e820_entries = count; 79 return boot_params.e820_entries = count;
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
index 1c3a8c557140..a04639dc633d 100644
--- a/drivers/gpu/drm/drm_crtc_helper.c
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -42,6 +42,26 @@ static struct drm_display_mode std_modes[] = {
42 DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, 42 DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) },
43}; 43};
44 44
45static void drm_mode_validate_flag(struct drm_connector *connector,
46 int flags)
47{
48 struct drm_display_mode *mode, *t;
49
50 if (flags == (DRM_MODE_FLAG_DBLSCAN | DRM_MODE_FLAG_INTERLACE))
51 return;
52
53 list_for_each_entry_safe(mode, t, &connector->modes, head) {
54 if ((mode->flags & DRM_MODE_FLAG_INTERLACE) &&
55 !(flags & DRM_MODE_FLAG_INTERLACE))
56 mode->status = MODE_NO_INTERLACE;
57 if ((mode->flags & DRM_MODE_FLAG_DBLSCAN) &&
58 !(flags & DRM_MODE_FLAG_DBLSCAN))
59 mode->status = MODE_NO_DBLESCAN;
60 }
61
62 return;
63}
64
45/** 65/**
46 * drm_helper_probe_connector_modes - get complete set of display modes 66 * drm_helper_probe_connector_modes - get complete set of display modes
47 * @dev: DRM device 67 * @dev: DRM device
@@ -72,6 +92,7 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector,
72 struct drm_connector_helper_funcs *connector_funcs = 92 struct drm_connector_helper_funcs *connector_funcs =
73 connector->helper_private; 93 connector->helper_private;
74 int count = 0; 94 int count = 0;
95 int mode_flags = 0;
75 96
76 DRM_DEBUG("%s\n", drm_get_connector_name(connector)); 97 DRM_DEBUG("%s\n", drm_get_connector_name(connector));
77 /* set all modes to the unverified state */ 98 /* set all modes to the unverified state */
@@ -96,6 +117,13 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector,
96 if (maxX && maxY) 117 if (maxX && maxY)
97 drm_mode_validate_size(dev, &connector->modes, maxX, 118 drm_mode_validate_size(dev, &connector->modes, maxX,
98 maxY, 0); 119 maxY, 0);
120
121 if (connector->interlace_allowed)
122 mode_flags |= DRM_MODE_FLAG_INTERLACE;
123 if (connector->doublescan_allowed)
124 mode_flags |= DRM_MODE_FLAG_DBLSCAN;
125 drm_mode_validate_flag(connector, mode_flags);
126
99 list_for_each_entry_safe(mode, t, &connector->modes, head) { 127 list_for_each_entry_safe(mode, t, &connector->modes, head) {
100 if (mode->status == MODE_OK) 128 if (mode->status == MODE_OK)
101 mode->status = connector_funcs->mode_valid(connector, 129 mode->status = connector_funcs->mode_valid(connector,
@@ -885,7 +913,6 @@ bool drm_helper_plugged_event(struct drm_device *dev)
885/** 913/**
886 * drm_initial_config - setup a sane initial connector configuration 914 * drm_initial_config - setup a sane initial connector configuration
887 * @dev: DRM device 915 * @dev: DRM device
888 * @can_grow: this configuration is growable
889 * 916 *
890 * LOCKING: 917 * LOCKING:
891 * Called at init time, must take mode config lock. 918 * Called at init time, must take mode config lock.
@@ -897,7 +924,7 @@ bool drm_helper_plugged_event(struct drm_device *dev)
897 * RETURNS: 924 * RETURNS:
898 * Zero if everything went ok, nonzero otherwise. 925 * Zero if everything went ok, nonzero otherwise.
899 */ 926 */
900bool drm_helper_initial_config(struct drm_device *dev, bool can_grow) 927bool drm_helper_initial_config(struct drm_device *dev)
901{ 928{
902 struct drm_connector *connector; 929 struct drm_connector *connector;
903 int count = 0; 930 int count = 0;
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index c67400067b85..ca9c61656714 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -125,10 +125,8 @@ static bool edid_is_valid(struct edid *edid)
125 DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version); 125 DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version);
126 goto bad; 126 goto bad;
127 } 127 }
128 if (edid->revision > 3) { 128 if (edid->revision > 4)
129 DRM_ERROR("EDID has minor version %d, which is not between 0-3\n", edid->revision); 129 DRM_DEBUG("EDID minor > 4, assuming backward compatibility\n");
130 goto bad;
131 }
132 130
133 for (i = 0; i < EDID_LENGTH; i++) 131 for (i = 0; i < EDID_LENGTH; i++)
134 csum += raw_edid[i]; 132 csum += raw_edid[i];
@@ -162,7 +160,7 @@ static bool edid_vendor(struct edid *edid, char *vendor)
162 edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@'; 160 edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@';
163 edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) | 161 edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) |
164 ((edid->mfg_id[1] & 0xe0) >> 5)) + '@'; 162 ((edid->mfg_id[1] & 0xe0) >> 5)) + '@';
165 edid_vendor[2] = (edid->mfg_id[2] & 0x1f) + '@'; 163 edid_vendor[2] = (edid->mfg_id[1] & 0x1f) + '@';
166 164
167 return !strncmp(edid_vendor, vendor, 3); 165 return !strncmp(edid_vendor, vendor, 3);
168} 166}
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0b9984ffed12..c23b3a95b7ce 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1042,7 +1042,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
1042 1042
1043 intel_modeset_init(dev); 1043 intel_modeset_init(dev);
1044 1044
1045 drm_helper_initial_config(dev, false); 1045 drm_helper_initial_config(dev);
1046 1046
1047 return 0; 1047 return 0;
1048 1048
diff --git a/drivers/s390/net/qeth_core_offl.c b/drivers/s390/net/qeth_core_offl.c
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/drivers/s390/net/qeth_core_offl.c
+++ /dev/null
diff --git a/drivers/s390/net/qeth_core_offl.h b/drivers/s390/net/qeth_core_offl.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/drivers/s390/net/qeth_core_offl.h
+++ /dev/null
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index bf3c0e32a334..b0bb29d804ae 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1765,7 +1765,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
1765 1765
1766static int uart_proc_show(struct seq_file *m, void *v) 1766static int uart_proc_show(struct seq_file *m, void *v)
1767{ 1767{
1768 struct tty_driver *ttydrv = v; 1768 struct tty_driver *ttydrv = m->private;
1769 struct uart_driver *drv = ttydrv->driver_state; 1769 struct uart_driver *drv = ttydrv->driver_state;
1770 int i; 1770 int i;
1771 1771
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o 11 compression.o delayed-ref.o
12else 12else
13 13
14# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138	 * yes, it's silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..dbb724124633 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1492 u8 lowest_level = 0; 1474 u8 lowest_level = 0;
1493 u64 blocknr; 1475 u64 blocknr;
1494 u64 gen; 1476 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1477
1497 lowest_level = p->lowest_level; 1478 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1479 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1482 if (ins_len < 0)
1502 lowest_unlock = 2; 1483 lowest_unlock = 2;
1503 1484
1504 prealloc_block.objectid = 0;
1505
1506again: 1485again:
1507 if (p->skip_locking) 1486 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1487 b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1508 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1509 goto cow_done;
1531 } 1510 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1511 btrfs_set_path_blocking(p);
1564 1512
1565 wret = btrfs_cow_block(trans, root, b, 1513 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1514 p->nodes[level + 1],
1567 p->slots[level + 1], 1515 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1516 if (wret) {
1571 free_extent_buffer(b); 1517 free_extent_buffer(b);
1572 ret = wret; 1518 ret = wret;
@@ -1742,12 +1688,8 @@ done:
1742 * we don't really know what they plan on doing with the path 1688 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1689 * from here on, so for now just mark it as blocking
1744 */ 1690 */
1745 btrfs_set_path_blocking(p); 1691 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1692 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root,
1748 prealloc_block.objectid,
1749 prealloc_block.offset);
1750 }
1751 return ret; 1693 return ret;
1752} 1694}
1753 1695
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1710 int ret;
1769 1711
1770 eb = btrfs_lock_root_node(root); 1712 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1713 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1714 BUG_ON(ret);
1773 1715
1774 btrfs_set_lock_blocking(eb); 1716 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1768 }
1827 1769
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1770 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1771 &eb);
1830 BUG_ON(ret); 1772 BUG_ON(ret);
1831 1773
1832 if (root->root_key.objectid == 1774 if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2081 spin_unlock(&root->node_lock);
2140 2082
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2083 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2084 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2085 root->root_key.objectid,
2144 trans->transid, level - 1); 2086 trans->transid, level - 1);
2145 BUG_ON(ret); 2087 BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2163 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2164 if (ret)
2223 return ret; 2165 return ret;
2224 } else { 2166 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2167 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2168 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2169 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2271 return ret;
2330} 2272}
2331 2273
2332/* 2274static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2275 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2276 struct btrfs_path *path,
2335 * 2277 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2278 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2279 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2280{
2343 struct extent_buffer *left = path->nodes[0]; 2281 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2282 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2283 struct btrfs_disk_key disk_key;
2347 int slot; 2284 int slot;
2348 u32 i; 2285 u32 i;
2349 int free_space;
2350 int push_space = 0; 2286 int push_space = 0;
2351 int push_items = 0; 2287 int push_items = 0;
2352 struct btrfs_item *item; 2288 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2289 u32 nr;
2355 u32 right_nritems; 2290 u32 right_nritems;
2356 u32 data_end; 2291 u32 data_end;
2357 u32 this_item_size; 2292 u32 this_item_size;
2358 int ret; 2293 int ret;
2359 2294
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 btrfs_assert_tree_locked(path->nodes[1]);
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2295 if (empty)
2393 nr = 0; 2296 nr = 0;
2394 else 2297 else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2300 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2301 push_space += data_size;
2399 2302
2303 slot = path->slots[1];
2400 i = left_nritems - 1; 2304 i = left_nritems - 1;
2401 while (i >= nr) { 2305 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2306 item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
2528} 2432}
2529 2433
2530/* 2434/*
2435 * push some data in the path leaf to the right, trying to free up at
2436 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2437 *
2438 * returns 1 if the push failed because the other node didn't have enough
2439 * room, 0 if everything worked out and < 0 if there were major errors.
2440 */
2441static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2442 *root, struct btrfs_path *path, int data_size,
2443 int empty)
2444{
2445 struct extent_buffer *left = path->nodes[0];
2446 struct extent_buffer *right;
2447 struct extent_buffer *upper;
2448 int slot;
2449 int free_space;
2450 u32 left_nritems;
2451 int ret;
2452
2453 if (!path->nodes[1])
2454 return 1;
2455
2456 slot = path->slots[1];
2457 upper = path->nodes[1];
2458 if (slot >= btrfs_header_nritems(upper) - 1)
2459 return 1;
2460
2461 btrfs_assert_tree_locked(path->nodes[1]);
2462
2463 right = read_node_slot(root, upper, slot + 1);
2464 btrfs_tree_lock(right);
2465 btrfs_set_lock_blocking(right);
2466
2467 free_space = btrfs_leaf_free_space(root, right);
2468 if (free_space < data_size)
2469 goto out_unlock;
2470
2471 /* cow and double check */
2472 ret = btrfs_cow_block(trans, root, right, upper,
2473 slot + 1, &right);
2474 if (ret)
2475 goto out_unlock;
2476
2477 free_space = btrfs_leaf_free_space(root, right);
2478 if (free_space < data_size)
2479 goto out_unlock;
2480
2481 left_nritems = btrfs_header_nritems(left);
2482 if (left_nritems == 0)
2483 goto out_unlock;
2484
2485 return __push_leaf_right(trans, root, path, data_size, empty,
2486 right, free_space, left_nritems);
2487out_unlock:
2488 btrfs_tree_unlock(right);
2489 free_extent_buffer(right);
2490 return 1;
2491}
2492
2493/*
2531 * push some data in the path leaf to the left, trying to free up at 2494 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2495 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2496 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2497static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2498 struct btrfs_root *root,
2536 int empty) 2499 struct btrfs_path *path, int data_size,
2500 int empty, struct extent_buffer *left,
2501 int free_space, int right_nritems)
2537{ 2502{
2538 struct btrfs_disk_key disk_key; 2503 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2504 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2505 int slot;
2542 int i; 2506 int i;
2543 int free_space;
2544 int push_space = 0; 2507 int push_space = 0;
2545 int push_items = 0; 2508 int push_items = 0;
2546 struct btrfs_item *item; 2509 struct btrfs_item *item;
2547 u32 old_left_nritems; 2510 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2511 u32 nr;
2550 int ret = 0; 2512 int ret = 0;
2551 int wret; 2513 int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2515 u32 old_left_item_size;
2554 2516
2555 slot = path->slots[1]; 2517 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 btrfs_assert_tree_locked(path->nodes[1]);
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2518
2592 if (empty) 2519 if (empty)
2593 nr = right_nritems; 2520 nr = right_nritems;
@@ -2755,6 +2682,154 @@ out:
2755} 2682}
2756 2683
2757/* 2684/*
2685 * push some data in the path leaf to the left, trying to free up at
2686 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2687 */
2688static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2689 *root, struct btrfs_path *path, int data_size,
2690 int empty)
2691{
2692 struct extent_buffer *right = path->nodes[0];
2693 struct extent_buffer *left;
2694 int slot;
2695 int free_space;
2696 u32 right_nritems;
2697 int ret = 0;
2698
2699 slot = path->slots[1];
2700 if (slot == 0)
2701 return 1;
2702 if (!path->nodes[1])
2703 return 1;
2704
2705 right_nritems = btrfs_header_nritems(right);
2706 if (right_nritems == 0)
2707 return 1;
2708
2709 btrfs_assert_tree_locked(path->nodes[1]);
2710
2711 left = read_node_slot(root, path->nodes[1], slot - 1);
2712 btrfs_tree_lock(left);
2713 btrfs_set_lock_blocking(left);
2714
2715 free_space = btrfs_leaf_free_space(root, left);
2716 if (free_space < data_size) {
2717 ret = 1;
2718 goto out;
2719 }
2720
2721 /* cow and double check */
2722 ret = btrfs_cow_block(trans, root, left,
2723 path->nodes[1], slot - 1, &left);
2724 if (ret) {
2725 /* we hit -ENOSPC, but it isn't fatal here */
2726 ret = 1;
2727 goto out;
2728 }
2729
2730 free_space = btrfs_leaf_free_space(root, left);
2731 if (free_space < data_size) {
2732 ret = 1;
2733 goto out;
2734 }
2735
2736 return __push_leaf_left(trans, root, path, data_size,
2737 empty, left, free_space, right_nritems);
2738out:
2739 btrfs_tree_unlock(left);
2740 free_extent_buffer(left);
2741 return ret;
2742}
2743
2744/*
2745 * split the path's leaf in two, making sure there is at least data_size
2746 * available for the resulting leaf level of the path.
2747 *
2748 * returns 0 if all went well and < 0 on failure.
2749 */
2750static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2751 struct btrfs_root *root,
2752 struct btrfs_path *path,
2753 struct extent_buffer *l,
2754 struct extent_buffer *right,
2755 int slot, int mid, int nritems)
2756{
2757 int data_copy_size;
2758 int rt_data_off;
2759 int i;
2760 int ret = 0;
2761 int wret;
2762 struct btrfs_disk_key disk_key;
2763
2764 nritems = nritems - mid;
2765 btrfs_set_header_nritems(right, nritems);
2766 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2767
2768 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2769 btrfs_item_nr_offset(mid),
2770 nritems * sizeof(struct btrfs_item));
2771
2772 copy_extent_buffer(right, l,
2773 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2774 data_copy_size, btrfs_leaf_data(l) +
2775 leaf_data_end(root, l), data_copy_size);
2776
2777 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2778 btrfs_item_end_nr(l, mid);
2779
2780 for (i = 0; i < nritems; i++) {
2781 struct btrfs_item *item = btrfs_item_nr(right, i);
2782 u32 ioff;
2783
2784 if (!right->map_token) {
2785 map_extent_buffer(right, (unsigned long)item,
2786 sizeof(struct btrfs_item),
2787 &right->map_token, &right->kaddr,
2788 &right->map_start, &right->map_len,
2789 KM_USER1);
2790 }
2791
2792 ioff = btrfs_item_offset(right, item);
2793 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2794 }
2795
2796 if (right->map_token) {
2797 unmap_extent_buffer(right, right->map_token, KM_USER1);
2798 right->map_token = NULL;
2799 }
2800
2801 btrfs_set_header_nritems(l, mid);
2802 ret = 0;
2803 btrfs_item_key(right, &disk_key, 0);
2804 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2805 path->slots[1] + 1, 1);
2806 if (wret)
2807 ret = wret;
2808
2809 btrfs_mark_buffer_dirty(right);
2810 btrfs_mark_buffer_dirty(l);
2811 BUG_ON(path->slots[0] != slot);
2812
2813 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2814 BUG_ON(ret);
2815
2816 if (mid <= slot) {
2817 btrfs_tree_unlock(path->nodes[0]);
2818 free_extent_buffer(path->nodes[0]);
2819 path->nodes[0] = right;
2820 path->slots[0] -= mid;
2821 path->slots[1] += 1;
2822 } else {
2823 btrfs_tree_unlock(right);
2824 free_extent_buffer(right);
2825 }
2826
2827 BUG_ON(path->slots[0] < 0);
2828
2829 return ret;
2830}
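/*
 * A minimal userspace sketch of the offset rebasing in copy_for_split()
 * above, assuming a "leaf" whose item data is packed from the end of a
 * LEAF_DATA_SIZE block; all names here are illustrative, not the
 * kernel's. The point: items moved to the new right leaf keep their
 * relative layout, shifted by rt_data_off so the first moved item's
 * data ends flush at the block end.
 */
#include <assert.h>
#include <stdio.h>

#define LEAF_DATA_SIZE 4096

struct item { unsigned offset; unsigned size; };

int main(void)
{
        /* three items in the left leaf, data packed from the block end */
        struct item left[3] = {
                { LEAF_DATA_SIZE - 100, 100 },
                { LEAF_DATA_SIZE - 160,  60 },
                { LEAF_DATA_SIZE - 200,  40 },
        };
        struct item right[3];
        int mid = 1, nritems = 3, i;
        unsigned rt_data_off =
                LEAF_DATA_SIZE - (left[mid].offset + left[mid].size);

        for (i = 0; i < nritems - mid; i++) {
                right[i] = left[mid + i];
                right[i].offset += rt_data_off;  /* rebase into new leaf */
        }
        assert(right[0].offset + right[0].size == LEAF_DATA_SIZE);
        printf("rebased by %u bytes\n", rt_data_off);
        return 0;
}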
2831
2832/*
2758 * split the path's leaf in two, making sure there is at least data_size 2833 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2834 * available for the resulting leaf level of the path.
2760 * 2835 *
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2846 int mid;
2772 int slot; 2847 int slot;
2773 struct extent_buffer *right; 2848 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2849 int ret = 0;
2778 int wret; 2850 int wret;
2779 int double_split; 2851 int double_split;
2780 int num_doubles = 0; 2852 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2853
2783 /* first try to make some room by pushing left and right */ 2854 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2855 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2856 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2857 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2858 if (wret < 0)
2787 return wret; 2859 return wret;
@@ -2830,11 +2902,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2902 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2903 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2904 BTRFS_UUID_SIZE);
2905
2833 if (mid <= slot) { 2906 if (mid <= slot) {
2834 if (nritems == 1 || 2907 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2908 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2909 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2910 if (slot >= nritems) {
2911 struct btrfs_disk_key disk_key;
2912
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2913 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2914 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2915 wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 2937 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 2938 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 2939 if (!extend && data_size && slot == 0) {
2940 struct btrfs_disk_key disk_key;
2941
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2942 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 2943 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 2944 wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
2894 } 2971 }
2895 } 2972 }
2896 } 2973 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928
2929 if (right->map_token) {
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933
2934 btrfs_set_header_nritems(l, mid);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945 2974
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2975 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2947 BUG_ON(ret); 2976 BUG_ON(ret);
2948 2977
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 2978 if (double_split) {
2963 BUG_ON(num_doubles != 0); 2979 BUG_ON(num_doubles != 0);
2964 num_doubles++; 2980 num_doubles++;
2965 goto again; 2981 goto again;
2966 } 2982 }
2983
2967 return ret; 2984 return ret;
2968} 2985}
2969 2986
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3038 return -EAGAIN;
3022 } 3039 }
3023 3040
3041 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3042 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3043 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3044 path->keep_locks = 0;
3027 BUG_ON(ret); 3045 BUG_ON(ret);
3028 3046
3047 btrfs_unlock_up_safe(path, 1);
3048 leaf = path->nodes[0];
3049 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3050
3051split:
3029 /* 3052 /*
3030 * make sure any changes to the path from split_leaf leave it 3053 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3054 * in a blocking state
3032 */ 3055 */
3033 btrfs_set_path_blocking(path); 3056 btrfs_set_path_blocking(path);
3034 3057
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3058 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3059 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3060 item_size = btrfs_item_size(leaf, item);
3042 3061
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3062 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3063 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3064 path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
3445} 3463}
3446 3464
3447/* 3465/*
3448 * Given a key and some data, insert items into the tree.
3449 * This does all the path init required, making room in the tree if needed.
3466 * this is a helper for btrfs_insert_empty_items, the main goal here is
3467 * to save stack depth by doing the bulk of the work in a function
3468 * that doesn't call btrfs_search_slot
3450 */ 3469 */
3451int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3452 struct btrfs_root *root,
3453 struct btrfs_path *path,
3454 struct btrfs_key *cpu_key, u32 *data_size,
3455 int nr)
3470static noinline_for_stack int
3471setup_items_for_insert(struct btrfs_trans_handle *trans,
3472 struct btrfs_root *root, struct btrfs_path *path,
3473 struct btrfs_key *cpu_key, u32 *data_size,
3474 u32 total_data, u32 total_size, int nr)
3456{ 3475{
3457 struct extent_buffer *leaf;
3458 struct btrfs_item *item; 3476 struct btrfs_item *item;
3459 int ret = 0;
3460 int slot;
3461 int slot_orig;
3462 int i; 3477 int i;
3463 u32 nritems; 3478 u32 nritems;
3464 u32 total_size = 0;
3465 u32 total_data = 0;
3466 unsigned int data_end; 3479 unsigned int data_end;
3467 struct btrfs_disk_key disk_key; 3480 struct btrfs_disk_key disk_key;
3481 int ret;
3482 struct extent_buffer *leaf;
3483 int slot;
3468 3484
3469 for (i = 0; i < nr; i++)
3470 total_data += data_size[i];
3471
3472 total_size = total_data + (nr * sizeof(struct btrfs_item));
3473 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3474 if (ret == 0)
3475 return -EEXIST;
3476 if (ret < 0)
3477 goto out;
3478
3479 slot_orig = path->slots[0];
3480 leaf = path->nodes[0]; 3485 leaf = path->nodes[0];
3486 slot = path->slots[0];
3481 3487
3482 nritems = btrfs_header_nritems(leaf); 3488 nritems = btrfs_header_nritems(leaf);
3483 data_end = leaf_data_end(root, leaf); 3489 data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3489 BUG(); 3495 BUG();
3490 } 3496 }
3491 3497
3492 slot = path->slots[0];
3493 BUG_ON(slot < 0);
3494
3495 if (slot != nritems) { 3498 if (slot != nritems) {
3496 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3499 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3497 3500
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3547 data_end -= data_size[i]; 3550 data_end -= data_size[i];
3548 btrfs_set_item_size(leaf, item, data_size[i]); 3551 btrfs_set_item_size(leaf, item, data_size[i]);
3549 } 3552 }
3553
3550 btrfs_set_header_nritems(leaf, nritems + nr); 3554 btrfs_set_header_nritems(leaf, nritems + nr);
3551 btrfs_mark_buffer_dirty(leaf);
3552 3555
3553 ret = 0; 3556 ret = 0;
3554 if (slot == 0) { 3557 if (slot == 0) {
3558 struct btrfs_disk_key disk_key;
3555 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3559 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3556 ret = fixup_low_keys(trans, root, path, &disk_key, 1); 3560 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3557 } 3561 }
3562 btrfs_unlock_up_safe(path, 1);
3563 btrfs_mark_buffer_dirty(leaf);
3558 3564
3559 if (btrfs_leaf_free_space(root, leaf) < 0) { 3565 if (btrfs_leaf_free_space(root, leaf) < 0) {
3560 btrfs_print_leaf(root, leaf); 3566 btrfs_print_leaf(root, leaf);
3561 BUG(); 3567 BUG();
3562 } 3568 }
3569 return ret;
3570}
3571
3572/*
3573 * Given a key and some data, insert items into the tree.
3574 * This does all the path init required, making room in the tree if needed.
3575 */
3576int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3577 struct btrfs_root *root,
3578 struct btrfs_path *path,
3579 struct btrfs_key *cpu_key, u32 *data_size,
3580 int nr)
3581{
3582 struct extent_buffer *leaf;
3583 int ret = 0;
3584 int slot;
3585 int i;
3586 u32 total_size = 0;
3587 u32 total_data = 0;
3588
3589 for (i = 0; i < nr; i++)
3590 total_data += data_size[i];
3591
3592 total_size = total_data + (nr * sizeof(struct btrfs_item));
3593 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3594 if (ret == 0)
3595 return -EEXIST;
3596 if (ret < 0)
3597 goto out;
3598
3599 leaf = path->nodes[0];
3600 slot = path->slots[0];
3601 BUG_ON(slot < 0);
3602
3603 ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3604 total_data, total_size, nr);
3605
3563out: 3606out:
3564 btrfs_unlock_up_safe(path, 1);
3565 return ret; 3607 return ret;
3566} 3608}
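/*
 * A toy illustration of the noinline_for_stack split above, under the
 * assumption of GCC-style attributes; every name below is made up.
 * Keeping the large locals in a helper the compiler may not inline
 * means their stack frame is gone while the deep search (standing in
 * for btrfs_search_slot) recurses.
 */
#include <stdio.h>
#include <string.h>

static __attribute__((noinline)) int do_bulk_work(const char *key)
{
        char scratch[512];              /* big locals live only here */
        memset(scratch, 0, sizeof(scratch));
        return (int)strlen(key);
}

static int deep_search(const char *key, int depth)
{
        if (depth == 0)
                return 0;
        return deep_search(key, depth - 1);
}

int insert_items(const char *key)
{
        int ret = deep_search(key, 8);  /* deep chain, small frame here */
        if (ret < 0)
                return ret;
        return do_bulk_work(key);       /* heavy frame only afterwards */
}

int main(void)
{
        printf("%d\n", insert_items("hello"));
        return 0;
}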
3567 3609
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3749 } 3791 }
3750 3792
3751 /* delete the leaf if it is mostly empty */ 3793 /* delete the leaf if it is mostly empty */
3752 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3794 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3795 !trans->transaction->delayed_refs.flushing) {
3753 /* push_leaf_left fixes the path. 3796 /* push_leaf_left fixes the path.
3754 * make sure the path still points to our leaf 3797 * make sure the path still points to our leaf
3755 * for possible call to del_ptr below 3798 * for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3757 slot = path->slots[1]; 3800 slot = path->slots[1];
3758 extent_buffer_get(leaf); 3801 extent_buffer_get(leaf);
3759 3802
3803 btrfs_set_path_blocking(path);
3760 wret = push_leaf_left(trans, root, path, 1, 1); 3804 wret = push_leaf_left(trans, root, path, 1, 1);
3761 if (wret < 0 && wret != -ENOSPC) 3805 if (wret < 0 && wret != -ENOSPC)
3762 ret = wret; 3806 ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7dd1b6d0bf32..9417713542a2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -401,15 +408,16 @@ struct btrfs_path {
401 int locks[BTRFS_MAX_LEVEL]; 408 int locks[BTRFS_MAX_LEVEL];
402 int reada; 409 int reada;
403 /* keep some upper locks as we walk down */ 410 /* keep some upper locks as we walk down */
404 int keep_locks;
405 int skip_locking;
406 int lowest_level; 411 int lowest_level;
407 412
408 /* 413 /*
409 * set by btrfs_split_item, tells search_slot to keep all locks 414 * set by btrfs_split_item, tells search_slot to keep all locks
410 * and to force calls to keep space in the nodes 415 * and to force calls to keep space in the nodes
411 */ 416 */
412 int search_for_split;
417 unsigned int search_for_split:1;
418 unsigned int keep_locks:1;
419 unsigned int skip_locking:1;
420 unsigned int leave_spinning:1;
413}; 421};
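/*
 * A sketch of what the bitfield conversion above buys: four int flags
 * collapse into single bits of one word. Struct names are illustrative
 * and the exact sizes are ABI-dependent, so this just prints them
 * rather than asserting.
 */
#include <stdio.h>

struct flags_as_ints {
        int search_for_split;
        int keep_locks;
        int skip_locking;
        int leave_spinning;
};

struct flags_as_bits {
        unsigned int search_for_split:1;
        unsigned int keep_locks:1;
        unsigned int skip_locking:1;
        unsigned int leave_spinning:1;
};

int main(void)
{
        printf("ints: %zu bytes, bitfields: %zu bytes\n",
               sizeof(struct flags_as_ints),
               sizeof(struct flags_as_bits));
        return 0;
}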
414 422
415/* 423/*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
688 struct rb_root block_group_cache_tree; 696 struct rb_root block_group_cache_tree;
689 697
690 struct extent_io_tree pinned_extents; 698 struct extent_io_tree pinned_extents;
691 struct extent_io_tree pending_del;
692 struct extent_io_tree extent_ins;
693 699
694 /* logical->physical extent mapping */ 700 /* logical->physical extent mapping */
695 struct btrfs_mapping_tree mapping_tree; 701 struct btrfs_mapping_tree mapping_tree;
696 702
697 u64 generation; 703 u64 generation;
698 u64 last_trans_committed; 704 u64 last_trans_committed;
699 u64 last_trans_new_blockgroup; 705
706 /*
707 * this is updated to the current trans every time a full commit
708 * is required instead of the faster short fsync log commits
709 */
710 u64 last_trans_log_full_commit;
700 u64 open_ioctl_trans; 711 u64 open_ioctl_trans;
701 unsigned long mount_opt; 712 unsigned long mount_opt;
702 u64 max_extent; 713 u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
717 struct mutex tree_log_mutex; 728 struct mutex tree_log_mutex;
718 struct mutex transaction_kthread_mutex; 729 struct mutex transaction_kthread_mutex;
719 struct mutex cleaner_mutex; 730 struct mutex cleaner_mutex;
720 struct mutex extent_ins_mutex;
721 struct mutex pinned_mutex; 731 struct mutex pinned_mutex;
722 struct mutex chunk_mutex; 732 struct mutex chunk_mutex;
723 struct mutex drop_mutex; 733 struct mutex drop_mutex;
724 struct mutex volume_mutex; 734 struct mutex volume_mutex;
725 struct mutex tree_reloc_mutex; 735 struct mutex tree_reloc_mutex;
736
737 /*
738 * this protects the ordered operations list only while we are
739 * processing all of the entries on it. This way we make
740 * sure the commit code doesn't find the list temporarily empty
741 * because another function happens to be doing non-waiting preflush
742 * before jumping into the main commit.
743 */
744 struct mutex ordered_operations_mutex;
745
726 struct list_head trans_list; 746 struct list_head trans_list;
727 struct list_head hashers; 747 struct list_head hashers;
728 struct list_head dead_roots; 748 struct list_head dead_roots;
@@ -737,10 +757,29 @@ struct btrfs_fs_info {
737 * ordered extents 757 * ordered extents
738 */ 758 */
739 spinlock_t ordered_extent_lock; 759 spinlock_t ordered_extent_lock;
760
761 /*
762 * all of the data=ordered extents pending writeback
763 * these can span multiple transactions and basically include
764 * every dirty data page that isn't from nodatacow
765 */
740 struct list_head ordered_extents; 766 struct list_head ordered_extents;
767
768 /*
769 * all of the inodes that have delalloc bytes. It is possible for
770 * this list to be empty even when there is still dirty data=ordered
771 * extents waiting to finish IO.
772 */
741 struct list_head delalloc_inodes; 773 struct list_head delalloc_inodes;
742 774
743 /* 775 /*
776 * special rename and truncate targets that must be on disk before
777 * we're allowed to commit. This is basically the ext3 style
778 * data=ordered list.
779 */
780 struct list_head ordered_operations;
781
782 /*
744 * there is a pool of worker threads for checksumming during writes 783 * there is a pool of worker threads for checksumming during writes
745 * and a pool for checksumming after reads. This is because readers 784 * and a pool for checksumming after reads. This is because readers
746 * can run with FS locks held, and the writers may be waiting for 785 * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
781 atomic_t throttle_gen; 820 atomic_t throttle_gen;
782 821
783 u64 total_pinned; 822 u64 total_pinned;
823
824 /* protected by the delalloc lock, used to keep from writing
825 * metadata until there is a nice batch
826 */
827 u64 dirty_metadata_bytes;
784 struct list_head dirty_cowonly_roots; 828 struct list_head dirty_cowonly_roots;
785 829
786 struct btrfs_fs_devices *fs_devices; 830 struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
1704} 1748}
1705 1749
1706/* extent-tree.c */ 1750/* extent-tree.c */
1751int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1752 struct btrfs_root *root, unsigned long count);
1707int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1753int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1708int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1709 struct btrfs_root *root, u64 bytenr,
1710 u64 num_bytes, u32 *refs);
1711int btrfs_update_pinned_extents(struct btrfs_root *root, 1754int btrfs_update_pinned_extents(struct btrfs_root *root,
1712 u64 bytenr, u64 num, int pin); 1755 u64 bytenr, u64 num, int pin);
1713int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 1756int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1714 struct btrfs_root *root, struct extent_buffer *leaf); 1757 struct btrfs_root *root, struct extent_buffer *leaf);
1715int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 1758int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1716 struct btrfs_root *root, u64 objectid, u64 bytenr); 1759 struct btrfs_root *root, u64 objectid, u64 bytenr);
1717int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1718 struct btrfs_root *root);
1719int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); 1760int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1720struct btrfs_block_group_cache *btrfs_lookup_block_group( 1761struct btrfs_block_group_cache *btrfs_lookup_block_group(
1721 struct btrfs_fs_info *info, 1762 struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1777 u64 root_objectid, u64 ref_generation, 1818 u64 root_objectid, u64 ref_generation,
1778 u64 owner_objectid); 1819 u64 owner_objectid);
1779int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 1820int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1780 struct btrfs_root *root, u64 bytenr,
1821 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
1781 u64 orig_parent, u64 parent, 1822 u64 orig_parent, u64 parent,
1782 u64 root_objectid, u64 ref_generation, 1823 u64 root_objectid, u64 ref_generation,
1783 u64 owner_objectid); 1824 u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1838int btrfs_cow_block(struct btrfs_trans_handle *trans, 1879int btrfs_cow_block(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, struct extent_buffer *buf, 1880 struct btrfs_root *root, struct extent_buffer *buf,
1840 struct extent_buffer *parent, int parent_slot, 1881 struct extent_buffer *parent, int parent_slot,
1841 struct extent_buffer **cow_ret, u64 prealloc_dest);
1882 struct extent_buffer **cow_ret);
1842int btrfs_copy_root(struct btrfs_trans_handle *trans, 1883int btrfs_copy_root(struct btrfs_trans_handle *trans,
1843 struct btrfs_root *root, 1884 struct btrfs_root *root,
1844 struct extent_buffer *buf, 1885 struct extent_buffer *buf,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..cbf7dc8ae3ec
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,669 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include <linux/ftrace.h>
22#include "ctree.h"
23#include "delayed-ref.h"
24#include "transaction.h"
25
26/*
27 * delayed back reference update tracking. For subvolume trees
28 * we queue up extent allocations and backref maintenance for
29 * delayed processing. This avoids deep call chains where we
30 * add extents in the middle of btrfs_search_slot, and it allows
31 * us to buffer up frequently modified backrefs in an rb tree instead
32 * of hammering updates on the extent allocation tree.
33 *
34 * Right now this code is only used for reference counted trees, but
35 * the long term goal is to get rid of the similar code for delayed
36 * extent tree modifications.
37 */
38
39/*
40 * entries in the rb tree are ordered by the byte number of the extent
41 * and by the byte number of the parent block.
42 */
43static int comp_entry(struct btrfs_delayed_ref_node *ref,
44 u64 bytenr, u64 parent)
45{
46 if (bytenr < ref->bytenr)
47 return -1;
48 if (bytenr > ref->bytenr)
49 return 1;
50 if (parent < ref->parent)
51 return -1;
52 if (parent > ref->parent)
53 return 1;
54 return 0;
55}
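/*
 * A userspace sketch of the (bytenr, parent) ordering comp_entry()
 * implements. Head nodes use parent == (u64)-1, so they sort after
 * every regular ref with the same bytenr; that is what lets
 * tree_search(..., bytenr, (u64)-1, ...) land on or just past the
 * head. Names here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ent { uint64_t bytenr, parent; };

static int cmp(const void *a, const void *b)
{
        const struct ent *x = a, *y = b;

        if (x->bytenr != y->bytenr)
                return x->bytenr < y->bytenr ? -1 : 1;
        if (x->parent != y->parent)
                return x->parent < y->parent ? -1 : 1;
        return 0;
}

int main(void)
{
        struct ent e[] = {
                { 4096, (uint64_t)-1 }, /* head ref */
                { 4096, 8192 },         /* regular ref */
                { 4096, 12288 },        /* regular ref */
        };

        qsort(e, 3, sizeof(e[0]), cmp);
        /* the head sorts last among entries for bytenr 4096 */
        printf("last parent: %llu\n", (unsigned long long)e[2].parent);
        return 0;
}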
56
57/*
58 * insert a new ref into the rbtree. This returns any existing refs
59 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
60 * inserted.
61 */
62static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
63 u64 bytenr, u64 parent,
64 struct rb_node *node)
65{
66 struct rb_node **p = &root->rb_node;
67 struct rb_node *parent_node = NULL;
68 struct btrfs_delayed_ref_node *entry;
69 int cmp;
70
71 while (*p) {
72 parent_node = *p;
73 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
74 rb_node);
75
76 cmp = comp_entry(entry, bytenr, parent);
77 if (cmp < 0)
78 p = &(*p)->rb_left;
79 else if (cmp > 0)
80 p = &(*p)->rb_right;
81 else
82 return entry;
83 }
84
85 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
86 rb_link_node(node, parent_node, p);
87 rb_insert_color(node, root);
88 return NULL;
89}
90
91/*
92 * find an entry based on (bytenr,parent). This returns the delayed
93 * ref if it was able to find one, or NULL if nothing was in that spot
94 */
95static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
96 u64 bytenr, u64 parent,
97 struct btrfs_delayed_ref_node **last)
98{
99 struct rb_node *n = root->rb_node;
100 struct btrfs_delayed_ref_node *entry;
101 int cmp;
102
103 while (n) {
104 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
105 WARN_ON(!entry->in_tree);
106 if (last)
107 *last = entry;
108
109 cmp = comp_entry(entry, bytenr, parent);
110 if (cmp < 0)
111 n = n->rb_left;
112 else if (cmp > 0)
113 n = n->rb_right;
114 else
115 return entry;
116 }
117 return NULL;
118}
119
120int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
121 struct btrfs_delayed_ref_head *head)
122{
123 struct btrfs_delayed_ref_root *delayed_refs;
124
125 delayed_refs = &trans->transaction->delayed_refs;
126 assert_spin_locked(&delayed_refs->lock);
127 if (mutex_trylock(&head->mutex))
128 return 0;
129
130 atomic_inc(&head->node.refs);
131 spin_unlock(&delayed_refs->lock);
132
133 mutex_lock(&head->mutex);
134 spin_lock(&delayed_refs->lock);
135 if (!head->node.in_tree) {
136 mutex_unlock(&head->mutex);
137 btrfs_put_delayed_ref(&head->node);
138 return -EAGAIN;
139 }
140 btrfs_put_delayed_ref(&head->node);
141 return 0;
142}
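/*
 * btrfs_delayed_ref_lock() above follows a common pattern: a mutex
 * cannot be slept on while a spinlock is held, so on trylock failure
 * the object is pinned, the spinlock dropped, the mutex taken, and
 * tree membership rechecked. A pthread sketch under simplified
 * assumptions (the spinlock is modeled by a mutex, the refcount by a
 * plain int guarded by it); all names are made up.
 */
#include <pthread.h>

struct head {
        pthread_mutex_t mutex;
        int refs;
        int in_tree;
};

/* call with tree_lock held; returns 0 with h->mutex held, -1 to retry */
static int head_lock(pthread_mutex_t *tree_lock, struct head *h)
{
        if (pthread_mutex_trylock(&h->mutex) == 0)
                return 0;

        h->refs++;                      /* pin so it cannot be freed */
        pthread_mutex_unlock(tree_lock);

        pthread_mutex_lock(&h->mutex);  /* may block */
        pthread_mutex_lock(tree_lock);
        h->refs--;
        if (!h->in_tree) {              /* raced with removal */
                pthread_mutex_unlock(&h->mutex);
                return -1;
        }
        return 0;
}

int main(void)
{
        pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
        struct head h = { PTHREAD_MUTEX_INITIALIZER, 1, 1 };

        pthread_mutex_lock(&tree_lock);
        if (head_lock(&tree_lock, &h) == 0)
                pthread_mutex_unlock(&h.mutex);
        pthread_mutex_unlock(&tree_lock);
        return 0;
}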
143
144int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
145 struct list_head *cluster, u64 start)
146{
147 int count = 0;
148 struct btrfs_delayed_ref_root *delayed_refs;
149 struct rb_node *node;
150 struct btrfs_delayed_ref_node *ref;
151 struct btrfs_delayed_ref_head *head;
152
153 delayed_refs = &trans->transaction->delayed_refs;
154 if (start == 0) {
155 node = rb_first(&delayed_refs->root);
156 } else {
157 ref = NULL;
158 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
159 if (ref) {
160 struct btrfs_delayed_ref_node *tmp;
161
162 node = rb_prev(&ref->rb_node);
163 while (node) {
164 tmp = rb_entry(node,
165 struct btrfs_delayed_ref_node,
166 rb_node);
167 if (tmp->bytenr < start)
168 break;
169 ref = tmp;
170 node = rb_prev(&ref->rb_node);
171 }
172 node = &ref->rb_node;
173 } else
174 node = rb_first(&delayed_refs->root);
175 }
176again:
177 while (node && count < 32) {
178 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
179 if (btrfs_delayed_ref_is_head(ref)) {
180 head = btrfs_delayed_node_to_head(ref);
181 if (list_empty(&head->cluster)) {
182 list_add_tail(&head->cluster, cluster);
183 delayed_refs->run_delayed_start =
184 head->node.bytenr;
185 count++;
186
187 WARN_ON(delayed_refs->num_heads_ready == 0);
188 delayed_refs->num_heads_ready--;
189 } else if (count) {
190 /* the goal of the clustering is to find extents
191 * that are likely to end up in the same extent
192 * leaf on disk. So, we don't want them spread
193 * all over the tree. Stop now if we've hit
194 * a head that was already in use
195 */
196 break;
197 }
198 }
199 node = rb_next(node);
200 }
201 if (count) {
202 return 0;
203 } else if (start) {
204 /*
205 * we've gone to the end of the rbtree without finding any
206 * clusters. start from the beginning and try again
207 */
208 start = 0;
209 node = rb_first(&delayed_refs->root);
210 goto again;
211 }
212 return 1;
213}
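/*
 * A toy model of the clustering loop in btrfs_find_ref_cluster():
 * walk heads in sorted order, gather up to 32 idle ones, and once the
 * cluster is non-empty stop at the first busy head so the batch stays
 * physically adjacent on disk. Purely illustrative.
 */
#include <stdio.h>

struct head { unsigned long bytenr; int busy; };

static int find_cluster(const struct head *h, int n, int *out, int max)
{
        int i, count = 0;

        for (i = 0; i < n && count < max; i++) {
                if (!h[i].busy)
                        out[count++] = i;
                else if (count)
                        break;          /* don't span a busy head */
        }
        return count;
}

int main(void)
{
        struct head h[] = {
                { 4096, 0 }, { 8192, 0 }, { 12288, 1 }, { 16384, 0 }
        };
        int out[32];

        printf("clustered %d heads\n", find_cluster(h, 4, out, 32));
        return 0;
}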
214
215/*
216 * This checks to see if there are any delayed refs in the
217 * btree for a given bytenr. It returns one if it finds any
218 * and zero otherwise.
219 *
220 * If it only finds a head node, it returns 0.
221 *
222 * The idea is to use this when deciding if you can safely delete an
223 * extent from the extent allocation tree. There may be a pending
224 * ref in the rbtree that adds or removes references, so as long as this
225 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
226 * allocation tree.
227 */
228int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
229{
230 struct btrfs_delayed_ref_node *ref;
231 struct btrfs_delayed_ref_root *delayed_refs;
232 struct rb_node *prev_node;
233 int ret = 0;
234
235 delayed_refs = &trans->transaction->delayed_refs;
236 spin_lock(&delayed_refs->lock);
237
238 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
239 if (ref) {
240 prev_node = rb_prev(&ref->rb_node);
241 if (!prev_node)
242 goto out;
243 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
244 rb_node);
245 if (ref->bytenr == bytenr)
246 ret = 1;
247 }
248out:
249 spin_unlock(&delayed_refs->lock);
250 return ret;
251}
252
253/*
254 * helper function to lookup reference count
255 *
256 * the head node for delayed ref is used to store the sum of all the
257 * reference count modifications queued up in the rbtree. This way you
258 * can check to see what the reference count would be if all of the
259 * delayed refs are processed.
260 */
261int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
262 struct btrfs_root *root, u64 bytenr,
263 u64 num_bytes, u32 *refs)
264{
265 struct btrfs_delayed_ref_node *ref;
266 struct btrfs_delayed_ref_head *head;
267 struct btrfs_delayed_ref_root *delayed_refs;
268 struct btrfs_path *path;
269 struct extent_buffer *leaf;
270 struct btrfs_extent_item *ei;
271 struct btrfs_key key;
272 u32 num_refs;
273 int ret;
274
275 path = btrfs_alloc_path();
276 if (!path)
277 return -ENOMEM;
278
279 key.objectid = bytenr;
280 key.type = BTRFS_EXTENT_ITEM_KEY;
281 key.offset = num_bytes;
282 delayed_refs = &trans->transaction->delayed_refs;
283again:
284 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
285 &key, path, 0, 0);
286 if (ret < 0)
287 goto out;
288
289 if (ret == 0) {
290 leaf = path->nodes[0];
291 ei = btrfs_item_ptr(leaf, path->slots[0],
292 struct btrfs_extent_item);
293 num_refs = btrfs_extent_refs(leaf, ei);
294 } else {
295 num_refs = 0;
296 ret = 0;
297 }
298
299 spin_lock(&delayed_refs->lock);
300 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
301 if (ref) {
302 head = btrfs_delayed_node_to_head(ref);
303 if (mutex_trylock(&head->mutex)) {
304 num_refs += ref->ref_mod;
305 mutex_unlock(&head->mutex);
306 *refs = num_refs;
307 goto out;
308 }
309
310 atomic_inc(&ref->refs);
311 spin_unlock(&delayed_refs->lock);
312
313 btrfs_release_path(root->fs_info->extent_root, path);
314
315 mutex_lock(&head->mutex);
316 mutex_unlock(&head->mutex);
317 btrfs_put_delayed_ref(ref);
318 goto again;
319 } else {
320 *refs = num_refs;
321 }
322out:
323 spin_unlock(&delayed_refs->lock);
324 btrfs_free_path(path);
325 return ret;
326}
327
328/*
329 * helper function to update an extent delayed ref in the
330 * rbtree. existing and update must both have the same
331 * bytenr and parent
332 *
333 * This may free existing if the update cancels out whatever
334 * operation it was doing.
335 */
336static noinline void
337update_existing_ref(struct btrfs_trans_handle *trans,
338 struct btrfs_delayed_ref_root *delayed_refs,
339 struct btrfs_delayed_ref_node *existing,
340 struct btrfs_delayed_ref_node *update)
341{
342 struct btrfs_delayed_ref *existing_ref;
343 struct btrfs_delayed_ref *ref;
344
345 existing_ref = btrfs_delayed_node_to_ref(existing);
346 ref = btrfs_delayed_node_to_ref(update);
347
348 if (ref->pin)
349 existing_ref->pin = 1;
350
351 if (ref->action != existing_ref->action) {
352 /*
353 * this is effectively undoing either an add or a
354 * drop. We decrement the ref_mod, and if it goes
355 * down to zero we just delete the entry without
 356 * ever changing the extent allocation tree.
357 */
358 existing->ref_mod--;
359 if (existing->ref_mod == 0) {
360 rb_erase(&existing->rb_node,
361 &delayed_refs->root);
362 existing->in_tree = 0;
363 btrfs_put_delayed_ref(existing);
364 delayed_refs->num_entries--;
365 if (trans->delayed_ref_updates)
366 trans->delayed_ref_updates--;
367 }
368 } else {
369 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
370 /* if we're adding refs, make sure all the
371 * details match up. The extent could
372 * have been totally freed and reallocated
373 * by a different owner before the delayed
374 * ref entries were removed.
375 */
376 existing_ref->owner_objectid = ref->owner_objectid;
377 existing_ref->generation = ref->generation;
378 existing_ref->root = ref->root;
379 existing->num_bytes = update->num_bytes;
380 }
381 /*
382 * the action on the existing ref matches
383 * the action on the ref we're trying to add.
384 * Bump the ref_mod by one so the backref that
385 * is eventually added/removed has the correct
386 * reference count
387 */
388 existing->ref_mod += update->ref_mod;
389 }
390}
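/*
 * The ref_mod arithmetic of update_existing_ref() in miniature:
 * queuing an add and then a drop for the same (bytenr, parent)
 * cancels out without ever touching the extent allocation tree,
 * while two updates with the same action accumulate. Illustrative
 * names and constants only.
 */
#include <stdio.h>

#define ADD_REF  1
#define DROP_REF 2

struct dref { int action; int ref_mod; };

/* returns 1 when the existing entry cancelled to zero and can go away */
static int merge(struct dref *existing, const struct dref *update)
{
        if (existing->action != update->action) {
                existing->ref_mod -= update->ref_mod;
                return existing->ref_mod == 0;
        }
        existing->ref_mod += update->ref_mod;
        return 0;
}

int main(void)
{
        struct dref e = { ADD_REF, 1 };
        struct dref u = { DROP_REF, 1 };

        printf("delete entry: %d\n", merge(&e, &u));    /* prints 1 */
        return 0;
}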
391
392/*
393 * helper function to update the accounting in the head ref
394 * existing and update must have the same bytenr
395 */
396static noinline void
397update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
398 struct btrfs_delayed_ref_node *update)
399{
400 struct btrfs_delayed_ref_head *existing_ref;
401 struct btrfs_delayed_ref_head *ref;
402
403 existing_ref = btrfs_delayed_node_to_head(existing);
404 ref = btrfs_delayed_node_to_head(update);
405
406 if (ref->must_insert_reserved) {
407 /* if the extent was freed and then
408 * reallocated before the delayed ref
409 * entries were processed, we can end up
410 * with an existing head ref without
411 * the must_insert_reserved flag set.
412 * Set it again here
413 */
414 existing_ref->must_insert_reserved = ref->must_insert_reserved;
415
416 /*
417 * update the num_bytes so we make sure the accounting
418 * is done correctly
419 */
420 existing->num_bytes = update->num_bytes;
421
422 }
423
424 /*
425 * update the reference mod on the head to reflect this new operation
426 */
427 existing->ref_mod += update->ref_mod;
428}
429
430/*
431 * helper function to actually insert a delayed ref into the rbtree.
432 * this does all the dirty work in terms of maintaining the correct
433 * overall modification count in the head node and properly dealing
434 * with updating existing nodes as new modifications are queued.
435 */
436static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
437 struct btrfs_delayed_ref_node *ref,
438 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
439 u64 ref_generation, u64 owner_objectid, int action,
440 int pin)
441{
442 struct btrfs_delayed_ref_node *existing;
443 struct btrfs_delayed_ref *full_ref;
444 struct btrfs_delayed_ref_head *head_ref = NULL;
445 struct btrfs_delayed_ref_root *delayed_refs;
446 int count_mod = 1;
447 int must_insert_reserved = 0;
448
449 /*
450 * the head node stores the sum of all the mods, so dropping a ref
451 * should drop the sum in the head node by one.
452 */
453 if (parent == (u64)-1) {
454 if (action == BTRFS_DROP_DELAYED_REF)
455 count_mod = -1;
456 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
457 count_mod = 0;
458 }
459
460 /*
461 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
462 * the reserved accounting when the extent is finally added, or
463 * if a later modification deletes the delayed ref without ever
464 * inserting the extent into the extent allocation tree.
465 * ref->must_insert_reserved is the flag used to record
466 * that accounting mods are required.
467 *
468 * Once we record must_insert_reserved, switch the action to
469 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
470 */
471 if (action == BTRFS_ADD_DELAYED_EXTENT) {
472 must_insert_reserved = 1;
473 action = BTRFS_ADD_DELAYED_REF;
474 } else {
475 must_insert_reserved = 0;
476 }
477
478
479 delayed_refs = &trans->transaction->delayed_refs;
480
481 /* first set the basic ref node struct up */
482 atomic_set(&ref->refs, 1);
483 ref->bytenr = bytenr;
484 ref->parent = parent;
485 ref->ref_mod = count_mod;
486 ref->in_tree = 1;
487 ref->num_bytes = num_bytes;
488
489 if (btrfs_delayed_ref_is_head(ref)) {
490 head_ref = btrfs_delayed_node_to_head(ref);
491 head_ref->must_insert_reserved = must_insert_reserved;
492 INIT_LIST_HEAD(&head_ref->cluster);
493 mutex_init(&head_ref->mutex);
494 } else {
495 full_ref = btrfs_delayed_node_to_ref(ref);
496 full_ref->root = ref_root;
497 full_ref->generation = ref_generation;
498 full_ref->owner_objectid = owner_objectid;
499 full_ref->pin = pin;
500 full_ref->action = action;
501 }
502
503 existing = tree_insert(&delayed_refs->root, bytenr,
504 parent, &ref->rb_node);
505
506 if (existing) {
507 if (btrfs_delayed_ref_is_head(ref))
508 update_existing_head_ref(existing, ref);
509 else
510 update_existing_ref(trans, delayed_refs, existing, ref);
511
512 /*
513 * we've updated the existing ref, free the newly
514 * allocated ref
515 */
516 kfree(ref);
517 } else {
518 if (btrfs_delayed_ref_is_head(ref)) {
519 delayed_refs->num_heads++;
520 delayed_refs->num_heads_ready++;
521 }
522 delayed_refs->num_entries++;
523 trans->delayed_ref_updates++;
524 }
525 return 0;
526}
527
528/*
529 * add a delayed ref to the tree. This does all of the accounting required
530 * to make sure the delayed ref is eventually processed before this
531 * transaction commits.
532 */
533int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
534 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
535 u64 ref_generation, u64 owner_objectid, int action,
536 int pin)
537{
538 struct btrfs_delayed_ref *ref;
539 struct btrfs_delayed_ref_head *head_ref;
540 struct btrfs_delayed_ref_root *delayed_refs;
541 int ret;
542
543 ref = kmalloc(sizeof(*ref), GFP_NOFS);
544 if (!ref)
545 return -ENOMEM;
546
547 /*
548 * the parent = 0 case comes from cases where we don't actually
 549 * know the parent yet. It will get updated later via an add/drop
550 * pair.
551 */
552 if (parent == 0)
553 parent = bytenr;
554
555 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
556 if (!head_ref) {
557 kfree(ref);
558 return -ENOMEM;
559 }
560 delayed_refs = &trans->transaction->delayed_refs;
561 spin_lock(&delayed_refs->lock);
562
563 /*
564 * insert both the head node and the new ref without dropping
565 * the spin lock
566 */
567 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
568 (u64)-1, 0, 0, 0, action, pin);
569 BUG_ON(ret);
570
571 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
572 parent, ref_root, ref_generation,
573 owner_objectid, action, pin);
574 BUG_ON(ret);
575 spin_unlock(&delayed_refs->lock);
576 return 0;
577}
578
579/*
580 * this does a simple search for the head node for a given extent.
581 * It must be called with the delayed ref spinlock held, and it returns
 582 * the head node if one was found, or NULL if not.
583 */
584struct btrfs_delayed_ref_head *
585btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
586{
587 struct btrfs_delayed_ref_node *ref;
588 struct btrfs_delayed_ref_root *delayed_refs;
589
590 delayed_refs = &trans->transaction->delayed_refs;
591 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
592 if (ref)
593 return btrfs_delayed_node_to_head(ref);
594 return NULL;
595}
596
597/*
598 * add a delayed ref to the tree. This does all of the accounting required
599 * to make sure the delayed ref is eventually processed before this
600 * transaction commits.
601 *
602 * The main point of this call is to add and remove a backreference in a single
603 * shot, taking the lock only once, and only searching for the head node once.
604 *
605 * It is the same as doing a ref add and delete in two separate calls.
606 */
607int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
608 u64 bytenr, u64 num_bytes, u64 orig_parent,
609 u64 parent, u64 orig_ref_root, u64 ref_root,
610 u64 orig_ref_generation, u64 ref_generation,
611 u64 owner_objectid, int pin)
612{
613 struct btrfs_delayed_ref *ref;
614 struct btrfs_delayed_ref *old_ref;
615 struct btrfs_delayed_ref_head *head_ref;
616 struct btrfs_delayed_ref_root *delayed_refs;
617 int ret;
618
619 ref = kmalloc(sizeof(*ref), GFP_NOFS);
620 if (!ref)
621 return -ENOMEM;
622
623 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
624 if (!old_ref) {
625 kfree(ref);
626 return -ENOMEM;
627 }
628
629 /*
630 * the parent = 0 case comes from cases where we don't actually
 631 * know the parent yet. It will get updated later via an add/drop
632 * pair.
633 */
634 if (parent == 0)
635 parent = bytenr;
636 if (orig_parent == 0)
637 orig_parent = bytenr;
638
639 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
640 if (!head_ref) {
641 kfree(ref);
642 kfree(old_ref);
643 return -ENOMEM;
644 }
645 delayed_refs = &trans->transaction->delayed_refs;
646 spin_lock(&delayed_refs->lock);
647
648 /*
649 * insert both the head node and the new ref without dropping
650 * the spin lock
651 */
652 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
653 (u64)-1, 0, 0, 0,
654 BTRFS_UPDATE_DELAYED_HEAD, 0);
655 BUG_ON(ret);
656
657 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
658 parent, ref_root, ref_generation,
659 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
660 BUG_ON(ret);
661
662 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
663 orig_parent, orig_ref_root,
664 orig_ref_generation, owner_objectid,
665 BTRFS_DROP_DELAYED_REF, pin);
666 BUG_ON(ret);
667 spin_unlock(&delayed_refs->lock);
668 return 0;
669}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
43 * how many refs is this entry adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
75 * when a new extent is allocated, it is just reserved in memory
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
 83 * we need to update the in-ram accounting to properly reflect
 84 * that the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
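/*
 * btrfs_put_delayed_ref() is the classic dec-and-test refcount
 * pattern: only the caller that drops the count to zero frees the
 * object. A C11 stdatomic sketch with an illustrative type (not the
 * kernel's atomic_t):
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ref { atomic_int refs; };

static void ref_put(struct ref *r)
{
        /* fetch_sub returns the old value; 1 means we were last */
        if (atomic_fetch_sub(&r->refs, 1) == 1)
                free(r);
}

int main(void)
{
        struct ref *r = malloc(sizeof(*r));

        atomic_init(&r->refs, 2);
        ref_put(r);             /* still alive, one ref left */
        ref_put(r);             /* last put frees it */
        printf("done\n");
        return 0;
}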
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
 167 * a node might live in a head or a regular ref; this lets you
168 * test for the proper type to use.
169 */
170static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
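/*
 * The btrfs_delayed_node_to_ref()/_to_head() helpers above are
 * container_of() at work: given a pointer to an embedded member,
 * recover the enclosing struct. A freestanding sketch with a local
 * container_of definition and made-up struct contents:
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct node { int dummy; };

struct head {
        struct node node;
        int must_insert_reserved;
};

int main(void)
{
        struct head h = { { 0 }, 1 };
        struct node *n = &h.node;       /* what the rbtree hands back */
        struct head *back = container_of(n, struct head, node);

        printf("%d\n", back->must_insert_reserved);     /* prints 1 */
        return 0;
}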
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
145 key.objectid = dir; 145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len); 147 key.offset = btrfs_name_hash(name, name_len);
148
148 path = btrfs_alloc_path(); 149 path = btrfs_alloc_path();
150 path->leave_spinning = 1;
151
149 data_size = sizeof(*dir_item) + name_len; 152 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 153 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len); 154 name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc869..92d73929d381 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
668static int btree_writepage(struct page *page, struct writeback_control *wbc) 668static int btree_writepage(struct page *page, struct writeback_control *wbc)
669{ 669{
670 struct extent_io_tree *tree; 670 struct extent_io_tree *tree;
671 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
672 struct extent_buffer *eb;
673 int was_dirty;
674
671 tree = &BTRFS_I(page->mapping->host)->io_tree; 675 tree = &BTRFS_I(page->mapping->host)->io_tree;
676 if (!(current->flags & PF_MEMALLOC)) {
677 return extent_write_full_page(tree, page,
678 btree_get_extent, wbc);
679 }
672 680
673 if (current->flags & PF_MEMALLOC) {
674 redirty_page_for_writepage(wbc, page);
675 unlock_page(page);
676 return 0;
681 redirty_page_for_writepage(wbc, page);
682 eb = btrfs_find_tree_block(root, page_offset(page),
683 PAGE_CACHE_SIZE);
684 WARN_ON(!eb);
685
686 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
687 if (!was_dirty) {
688 spin_lock(&root->fs_info->delalloc_lock);
689 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
690 spin_unlock(&root->fs_info->delalloc_lock);
677 } 691 }
678 return extent_write_full_page(tree, page, btree_get_extent, wbc);
692 free_extent_buffer(eb);
693
694 unlock_page(page);
695 return 0;
679} 696}
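/*
 * The accounting in btree_writepage() above only bumps
 * dirty_metadata_bytes when the buffer's dirty bit actually flips, so
 * re-dirtying never double counts. A single-threaded sketch of the
 * same idea, with test_and_set_bit replaced by a plain flag and
 * made-up names:
 */
#include <stdio.h>

struct buffer { int dirty; unsigned long len; };

static unsigned long dirty_bytes;

static void mark_dirty(struct buffer *b)
{
        int was_dirty = b->dirty;

        b->dirty = 1;
        if (!was_dirty)
                dirty_bytes += b->len;  /* count each buffer once */
}

int main(void)
{
        struct buffer b = { 0, 4096 };

        mark_dirty(&b);
        mark_dirty(&b);                 /* no-op for accounting */
        printf("%lu\n", dirty_bytes);   /* prints 4096 */
        return 0;
}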
680 697
681static int btree_writepages(struct address_space *mapping, 698static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
684 struct extent_io_tree *tree; 701 struct extent_io_tree *tree;
685 tree = &BTRFS_I(mapping->host)->io_tree; 702 tree = &BTRFS_I(mapping->host)->io_tree;
686 if (wbc->sync_mode == WB_SYNC_NONE) { 703 if (wbc->sync_mode == WB_SYNC_NONE) {
704 struct btrfs_root *root = BTRFS_I(mapping->host)->root;
687 u64 num_dirty; 705 u64 num_dirty;
688 u64 start = 0;
689 unsigned long thresh = 32 * 1024 * 1024; 706 unsigned long thresh = 32 * 1024 * 1024;
690 707
691 if (wbc->for_kupdate) 708 if (wbc->for_kupdate)
692 return 0; 709 return 0;
693 710
694 num_dirty = count_range_bits(tree, &start, (u64)-1,
695 thresh, EXTENT_DIRTY);
711 /* this is a bit racy, but that's ok */
712 num_dirty = root->fs_info->dirty_metadata_bytes;
696 if (num_dirty < thresh) 713 if (num_dirty < thresh)
697 return 0; 714 return 0;
698 } 715 }
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
859 root->fs_info->running_transaction->transid) { 876 root->fs_info->running_transaction->transid) {
860 btrfs_assert_tree_locked(buf); 877 btrfs_assert_tree_locked(buf);
861 878
862 /* ugh, clear_extent_buffer_dirty can be expensive */
863 btrfs_set_lock_blocking(buf);
879 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
880 spin_lock(&root->fs_info->delalloc_lock);
881 if (root->fs_info->dirty_metadata_bytes >= buf->len)
882 root->fs_info->dirty_metadata_bytes -= buf->len;
883 else
884 WARN_ON(1);
885 spin_unlock(&root->fs_info->delalloc_lock);
886 }
864 887
888 /* ugh, clear_extent_buffer_dirty needs to lock the page */
889 btrfs_set_lock_blocking(buf);
865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 890 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
866 buf); 891 buf);
867 } 892 }
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
1471 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1496 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1472 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1497 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1473 1498
1474 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1475 printk(KERN_INFO "btrfs: total reference cache "
1476 "size %llu\n",
1477 root->fs_info->total_ref_cache_size);
1478 }
1479
1480 mutex_lock(&root->fs_info->trans_mutex); 1499 mutex_lock(&root->fs_info->trans_mutex);
1481 cur = root->fs_info->running_transaction; 1500 cur = root->fs_info->running_transaction;
1482 if (!cur) { 1501 if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
1493 mutex_unlock(&root->fs_info->trans_mutex); 1512 mutex_unlock(&root->fs_info->trans_mutex);
1494 trans = btrfs_start_transaction(root, 1); 1513 trans = btrfs_start_transaction(root, 1);
1495 ret = btrfs_commit_transaction(trans, root); 1514 ret = btrfs_commit_transaction(trans, root);
1515
1496sleep: 1516sleep:
1497 wake_up_process(root->fs_info->cleaner_kthread); 1517 wake_up_process(root->fs_info->cleaner_kthread);
1498 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1518 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1552 INIT_LIST_HEAD(&fs_info->dead_roots); 1572 INIT_LIST_HEAD(&fs_info->dead_roots);
1553 INIT_LIST_HEAD(&fs_info->hashers); 1573 INIT_LIST_HEAD(&fs_info->hashers);
1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1574 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1575 INIT_LIST_HEAD(&fs_info->ordered_operations);
1555 spin_lock_init(&fs_info->delalloc_lock); 1576 spin_lock_init(&fs_info->delalloc_lock);
1556 spin_lock_init(&fs_info->new_trans_lock); 1577 spin_lock_init(&fs_info->new_trans_lock);
1557 spin_lock_init(&fs_info->ref_cache_lock); 1578 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1611 1632
1612 extent_io_tree_init(&fs_info->pinned_extents, 1633 extent_io_tree_init(&fs_info->pinned_extents,
1613 fs_info->btree_inode->i_mapping, GFP_NOFS); 1634 fs_info->btree_inode->i_mapping, GFP_NOFS);
1614 extent_io_tree_init(&fs_info->pending_del,
1615 fs_info->btree_inode->i_mapping, GFP_NOFS);
1616 extent_io_tree_init(&fs_info->extent_ins,
1617 fs_info->btree_inode->i_mapping, GFP_NOFS);
1618 fs_info->do_barriers = 1; 1635 fs_info->do_barriers = 1;
1619 1636
1620 INIT_LIST_HEAD(&fs_info->dead_reloc_roots); 1637 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 insert_inode_hash(fs_info->btree_inode); 1644 insert_inode_hash(fs_info->btree_inode);
1628 1645
1629 mutex_init(&fs_info->trans_mutex); 1646 mutex_init(&fs_info->trans_mutex);
1647 mutex_init(&fs_info->ordered_operations_mutex);
1630 mutex_init(&fs_info->tree_log_mutex); 1648 mutex_init(&fs_info->tree_log_mutex);
1631 mutex_init(&fs_info->drop_mutex); 1649 mutex_init(&fs_info->drop_mutex);
1632 mutex_init(&fs_info->extent_ins_mutex);
1633 mutex_init(&fs_info->pinned_mutex); 1650 mutex_init(&fs_info->pinned_mutex);
1634 mutex_init(&fs_info->chunk_mutex); 1651 mutex_init(&fs_info->chunk_mutex);
1635 mutex_init(&fs_info->transaction_kthread_mutex); 1652 mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2358 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 2375 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2359 u64 transid = btrfs_header_generation(buf); 2376 u64 transid = btrfs_header_generation(buf);
2360 struct inode *btree_inode = root->fs_info->btree_inode; 2377 struct inode *btree_inode = root->fs_info->btree_inode;
2361 2378 int was_dirty;
2362 btrfs_set_lock_blocking(buf);
2363 2379
2364 btrfs_assert_tree_locked(buf); 2380 btrfs_assert_tree_locked(buf);
2365 if (transid != root->fs_info->generation) { 2381 if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2370 (unsigned long long)root->fs_info->generation); 2386 (unsigned long long)root->fs_info->generation);
2371 WARN_ON(1); 2387 WARN_ON(1);
2372 } 2388 }
2373 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 2389 was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2390 buf);
2391 if (!was_dirty) {
2392 spin_lock(&root->fs_info->delalloc_lock);
2393 root->fs_info->dirty_metadata_bytes += buf->len;
2394 spin_unlock(&root->fs_info->delalloc_lock);
2395 }
2374} 2396}
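The mark-dirty side is the mirror image: set_extent_buffer_dirty() now reports whether the buffer was already dirty, and only a clean-to-dirty transition grows the counter, so re-dirtying the same buffer is never double counted. A compact sketch under the same illustrative assumptions as the previous one:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define EB_DIRTY 0UL

static pthread_mutex_t delalloc_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long dirty_metadata_bytes;

static void mark_dirty(atomic_ulong *bflags, unsigned long long len)
{
        /* was_dirty analogue: the previous value tells us whether
         * this call performed the 0 -> 1 transition */
        unsigned long was = atomic_fetch_or(bflags, 1UL << EB_DIRTY);

        if (!(was & (1UL << EB_DIRTY))) {
                pthread_mutex_lock(&delalloc_lock);
                dirty_metadata_bytes += len;
                pthread_mutex_unlock(&delalloc_lock);
        }
}

int main(void)
{
        atomic_ulong bflags = 0;

        mark_dirty(&bflags, 4096);   /* counted */
        mark_dirty(&bflags, 4096);   /* already dirty: not counted again */
        printf("dirty_metadata_bytes = %llu\n", dirty_metadata_bytes);
        return 0;
}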
2375 2397
2376void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 2398void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2410int btree_lock_page_hook(struct page *page) 2432int btree_lock_page_hook(struct page *page)
2411{ 2433{
2412 struct inode *inode = page->mapping->host; 2434 struct inode *inode = page->mapping->host;
2435 struct btrfs_root *root = BTRFS_I(inode)->root;
2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2436 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2414 struct extent_buffer *eb; 2437 struct extent_buffer *eb;
2415 unsigned long len; 2438 unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
2425 2448
2426 btrfs_tree_lock(eb); 2449 btrfs_tree_lock(eb);
2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2450 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2451
2452 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2453 spin_lock(&root->fs_info->delalloc_lock);
2454 if (root->fs_info->dirty_metadata_bytes >= eb->len)
2455 root->fs_info->dirty_metadata_bytes -= eb->len;
2456 else
2457 WARN_ON(1);
2458 spin_unlock(&root->fs_info->delalloc_lock);
2459 }
2460
2428 btrfs_tree_unlock(eb); 2461 btrfs_tree_unlock(eb);
2429 free_extent_buffer(eb); 2462 free_extent_buffer(eb);
2430out: 2463out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 77int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root, 78int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..f5e7cae63d80 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,17 +49,23 @@ struct pending_extent_op {
49 int del; 49 int del;
50}; 50};
51 51
52static int finish_current_insert(struct btrfs_trans_handle *trans, 52static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all); 53 struct btrfs_root *root, u64 parent,
54static int del_pending_extents(struct btrfs_trans_handle *trans, 54 u64 root_objectid, u64 ref_generation,
55 struct btrfs_root *extent_root, int all); 55 u64 owner, struct btrfs_key *ins,
56static int pin_down_bytes(struct btrfs_trans_handle *trans, 56 int ref_mod);
57 struct btrfs_root *root, 57static int update_reserved_extents(struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data); 58 u64 bytenr, u64 num, int reserve);
59static int update_block_group(struct btrfs_trans_handle *trans, 59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root, 60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc, 61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 62 int mark_free);
63static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 u64 bytenr, u64 num_bytes, u64 parent,
66 u64 root_objectid, u64 ref_generation,
67 u64 owner_objectid, int pin,
68 int ref_to_drop);
63 69
64static int do_chunk_alloc(struct btrfs_trans_handle *trans, 70static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes, 71 struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
554 return ret; 560 return ret;
555} 561}
556 562
557/*
558 * updates all the backrefs that are pending on update_list for the
559 * extent_root
560 */
561static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 struct btrfs_root *extent_root,
563 struct btrfs_path *path,
564 struct list_head *update_list)
565{
566 struct btrfs_key key;
567 struct btrfs_extent_ref *ref;
568 struct btrfs_fs_info *info = extent_root->fs_info;
569 struct pending_extent_op *op;
570 struct extent_buffer *leaf;
571 int ret = 0;
572 struct list_head *cur = update_list->next;
573 u64 ref_objectid;
574 u64 ref_root = extent_root->root_key.objectid;
575
576 op = list_entry(cur, struct pending_extent_op, list);
577
578search:
579 key.objectid = op->bytenr;
580 key.type = BTRFS_EXTENT_REF_KEY;
581 key.offset = op->orig_parent;
582
583 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 BUG_ON(ret);
585
586 leaf = path->nodes[0];
587
588loop:
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590
591 ref_objectid = btrfs_ref_objectid(leaf, ref);
592
593 if (btrfs_ref_root(leaf, ref) != ref_root ||
594 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 (ref_objectid != op->level &&
596 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 "root %llu, owner %u\n",
599 (unsigned long long)op->bytenr,
600 (unsigned long long)op->orig_parent,
601 (unsigned long long)ref_root, op->level);
602 btrfs_print_leaf(extent_root, leaf);
603 BUG();
604 }
605
606 key.objectid = op->bytenr;
607 key.offset = op->parent;
608 key.type = BTRFS_EXTENT_REF_KEY;
609 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 BUG_ON(ret);
611 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 btrfs_set_ref_generation(leaf, ref, op->generation);
613
614 cur = cur->next;
615
616 list_del_init(&op->list);
617 unlock_extent(&info->extent_ins, op->bytenr,
618 op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 kfree(op);
620
621 if (cur == update_list) {
622 btrfs_mark_buffer_dirty(path->nodes[0]);
623 btrfs_release_path(extent_root, path);
624 goto out;
625 }
626
627 op = list_entry(cur, struct pending_extent_op, list);
628
629 path->slots[0]++;
630 while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 if (key.objectid == op->bytenr &&
633 key.type == BTRFS_EXTENT_REF_KEY)
634 goto loop;
635 path->slots[0]++;
636 }
637
638 btrfs_mark_buffer_dirty(path->nodes[0]);
639 btrfs_release_path(extent_root, path);
640 goto search;
641
642out:
643 return 0;
644}
645
646static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 struct btrfs_root *extent_root,
648 struct btrfs_path *path,
649 struct list_head *insert_list, int nr)
650{
651 struct btrfs_key *keys;
652 u32 *data_size;
653 struct pending_extent_op *op;
654 struct extent_buffer *leaf;
655 struct list_head *cur = insert_list->next;
656 struct btrfs_fs_info *info = extent_root->fs_info;
657 u64 ref_root = extent_root->root_key.objectid;
658 int i = 0, last = 0, ret;
659 int total = nr * 2;
660
661 if (!nr)
662 return 0;
663
664 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 if (!keys)
666 return -ENOMEM;
667
668 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 if (!data_size) {
670 kfree(keys);
671 return -ENOMEM;
672 }
673
674 list_for_each_entry(op, insert_list, list) {
675 keys[i].objectid = op->bytenr;
676 keys[i].offset = op->num_bytes;
677 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 data_size[i] = sizeof(struct btrfs_extent_item);
679 i++;
680
681 keys[i].objectid = op->bytenr;
682 keys[i].offset = op->parent;
683 keys[i].type = BTRFS_EXTENT_REF_KEY;
684 data_size[i] = sizeof(struct btrfs_extent_ref);
685 i++;
686 }
687
688 op = list_entry(cur, struct pending_extent_op, list);
689 i = 0;
690 while (i < total) {
691 int c;
692 ret = btrfs_insert_some_items(trans, extent_root, path,
693 keys+i, data_size+i, total-i);
694 BUG_ON(ret < 0);
695
696 if (last && ret > 1)
697 BUG();
698
699 leaf = path->nodes[0];
700 for (c = 0; c < ret; c++) {
701 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702
703 /*
704 * if the first item we inserted was a backref, then
705 * the EXTENT_ITEM will be the odd c's, else it will
706 * be the even c's
707 */
708 if ((ref_first && (c % 2)) ||
709 (!ref_first && !(c % 2))) {
710 struct btrfs_extent_item *itm;
711
712 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 struct btrfs_extent_item);
714 btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 op->del++;
716 } else {
717 struct btrfs_extent_ref *ref;
718
719 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 struct btrfs_extent_ref);
721 btrfs_set_ref_root(leaf, ref, ref_root);
722 btrfs_set_ref_generation(leaf, ref,
723 op->generation);
724 btrfs_set_ref_objectid(leaf, ref, op->level);
725 btrfs_set_ref_num_refs(leaf, ref, 1);
726 op->del++;
727 }
728
729 /*
730 * using del to see when it's ok to free up the
731 * pending_extent_op. In the case where we insert the
732 * last item on the list in order to help do batching
733 * we need to not free the extent op until we actually
734 * insert the extent_item
735 */
736 if (op->del == 2) {
737 unlock_extent(&info->extent_ins, op->bytenr,
738 op->bytenr + op->num_bytes - 1,
739 GFP_NOFS);
740 cur = cur->next;
741 list_del_init(&op->list);
742 kfree(op);
743 if (cur != insert_list)
744 op = list_entry(cur,
745 struct pending_extent_op,
746 list);
747 }
748 }
749 btrfs_mark_buffer_dirty(leaf);
750 btrfs_release_path(extent_root, path);
751
752 /*
753 * Ok, backrefs and items usually go right next to each other,
754 * but if we could only insert 1 item that means that we
755 * inserted on the end of a leaf, and we have no idea what may
756 * be on the next leaf so we just play it safe. In order to
757 * try and help this case we insert the last thing on our
758 * insert list so hopefully it will end up being the last
759 * thing on the leaf and everything else will be before it,
760 * which will let us insert a whole bunch of items at the same
761 * time.
762 */
763 if (ret == 1 && !last && (i + ret < total)) {
764 /*
765 * last: where we will pick up the next time around
766 * i: our current key to insert, will be total - 1
767 * cur: the current op we are screwing with
768 * op: duh
769 */
770 last = i + ret;
771 i = total - 1;
772 cur = insert_list->prev;
773 op = list_entry(cur, struct pending_extent_op, list);
774 } else if (last) {
775 /*
776 * ok we successfully inserted the last item on the
777 * list, lets reset everything
778 *
779 * i: our current key to insert, so where we left off
780 * last time
781 * last: done with this
782 * cur: the op we are messing with
783 * op: duh
784 * total: since we inserted the last key, we need to
785 * decrement total so we don't overflow
786 */
787 i = last;
788 last = 0;
789 total--;
790 if (i < total) {
791 cur = insert_list->next;
792 op = list_entry(cur, struct pending_extent_op,
793 list);
794 }
795 } else {
796 i += ret;
797 }
798
799 cond_resched();
800 }
801 ret = 0;
802 kfree(keys);
803 kfree(data_size);
804 return ret;
805}
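The index bookkeeping in the function deleted above is subtle enough to restate: when the inserter could only place one item (it hit the end of a leaf), the code saves its position in last, jumps to the final key so the tail lands at the end of a fresh leaf, then resumes from last with total shrunk by one. A toy model of that dance — insert_some() and its hard-coded batch sizes are invented for illustration:

#include <stdio.h>

/* places up to 3 consecutive keys; the first call pretends we hit
 * the end of a leaf and could only fit one item */
static int insert_some(int want)
{
        static int call;

        if (++call == 1)
                return 1;
        return want < 3 ? want : 3;
}

int main(void)
{
        int total = 7, i = 0, last = 0;

        while (i < total) {
                int ret = insert_some(total - i);

                if (ret == 1 && !last && i + ret < total) {
                        /* only one fit: remember where we were and
                         * insert the final key next */
                        last = i + ret;
                        i = total - 1;
                } else if (last) {
                        /* tail is in; resume from the saved spot and
                         * shrink total so the tail isn't redone */
                        i = last;
                        last = 0;
                        total--;
                } else {
                        i += ret;
                }
                printf("i=%d last=%d total=%d\n", i, last, total);
        }
        return 0;
}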
806
807static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, 563static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 struct btrfs_root *root, 564 struct btrfs_root *root,
809 struct btrfs_path *path, 565 struct btrfs_path *path,
810 u64 bytenr, u64 parent, 566 u64 bytenr, u64 parent,
811 u64 ref_root, u64 ref_generation, 567 u64 ref_root, u64 ref_generation,
812 u64 owner_objectid) 568 u64 owner_objectid,
569 int refs_to_add)
813{ 570{
814 struct btrfs_key key; 571 struct btrfs_key key;
815 struct extent_buffer *leaf; 572 struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
829 btrfs_set_ref_root(leaf, ref, ref_root); 586 btrfs_set_ref_root(leaf, ref, ref_root);
830 btrfs_set_ref_generation(leaf, ref, ref_generation); 587 btrfs_set_ref_generation(leaf, ref, ref_generation);
831 btrfs_set_ref_objectid(leaf, ref, owner_objectid); 588 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 btrfs_set_ref_num_refs(leaf, ref, 1); 589 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
833 } else if (ret == -EEXIST) { 590 } else if (ret == -EEXIST) {
834 u64 existing_owner; 591 u64 existing_owner;
592
835 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); 593 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 leaf = path->nodes[0]; 594 leaf = path->nodes[0];
837 ref = btrfs_item_ptr(leaf, path->slots[0], 595 ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
845 603
846 num_refs = btrfs_ref_num_refs(leaf, ref); 604 num_refs = btrfs_ref_num_refs(leaf, ref);
847 BUG_ON(num_refs == 0); 605 BUG_ON(num_refs == 0);
848 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); 606 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
849 607
850 existing_owner = btrfs_ref_objectid(leaf, ref); 608 existing_owner = btrfs_ref_objectid(leaf, ref);
851 if (existing_owner != owner_objectid && 609 if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
857 } else { 615 } else {
858 goto out; 616 goto out;
859 } 617 }
618 btrfs_unlock_up_safe(path, 1);
860 btrfs_mark_buffer_dirty(path->nodes[0]); 619 btrfs_mark_buffer_dirty(path->nodes[0]);
861out: 620out:
862 btrfs_release_path(root, path); 621 btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
865 624
866static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, 625static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 struct btrfs_root *root, 626 struct btrfs_root *root,
868 struct btrfs_path *path) 627 struct btrfs_path *path,
628 int refs_to_drop)
869{ 629{
870 struct extent_buffer *leaf; 630 struct extent_buffer *leaf;
871 struct btrfs_extent_ref *ref; 631 struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
875 leaf = path->nodes[0]; 635 leaf = path->nodes[0];
876 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); 636 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 num_refs = btrfs_ref_num_refs(leaf, ref); 637 num_refs = btrfs_ref_num_refs(leaf, ref);
878 BUG_ON(num_refs == 0); 638 BUG_ON(num_refs < refs_to_drop);
879 num_refs -= 1; 639 num_refs -= refs_to_drop;
880 if (num_refs == 0) { 640 if (num_refs == 0) {
881 ret = btrfs_del_item(trans, root, path); 641 ret = btrfs_del_item(trans, root, path);
882 } else { 642 } else {
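Threading refs_to_add/refs_to_drop through these two helpers is what lets the delayed-ref code further down collapse many +1/-1 reference changes into a single tree operation. A toy model of the merged update — struct and function names are invented:

#include <assert.h>
#include <stdio.h>

struct backref {
        unsigned long long num_refs;
};

static void add_refs(struct backref *ref, unsigned refs_to_add)
{
        ref->num_refs += refs_to_add;   /* one update, N references */
}

static int drop_refs(struct backref *ref, unsigned refs_to_drop)
{
        assert(ref->num_refs >= refs_to_drop);  /* BUG_ON() analogue */
        ref->num_refs -= refs_to_drop;
        return ref->num_refs == 0;      /* caller deletes the item */
}

int main(void)
{
        struct backref ref = { 0 };

        add_refs(&ref, 3);      /* three queued ADDs, one insert */
        if (drop_refs(&ref, 3)) /* three queued DROPs, one delete */
                printf("backref item deleted\n");
        return 0;
}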
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
927#endif 687#endif
928} 688}
929 689
930static noinline int free_extents(struct btrfs_trans_handle *trans,
931 struct btrfs_root *extent_root,
932 struct list_head *del_list)
933{
934 struct btrfs_fs_info *info = extent_root->fs_info;
935 struct btrfs_path *path;
936 struct btrfs_key key, found_key;
937 struct extent_buffer *leaf;
938 struct list_head *cur;
939 struct pending_extent_op *op;
940 struct btrfs_extent_item *ei;
941 int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 u32 refs;
943 u64 bytes_freed = 0;
944
945 path = btrfs_alloc_path();
946 if (!path)
947 return -ENOMEM;
948 path->reada = 1;
949
950search:
951 /* search for the backref for the current ref we want to delete */
952 cur = del_list->next;
953 op = list_entry(cur, struct pending_extent_op, list);
954 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 op->orig_parent,
956 extent_root->root_key.objectid,
957 op->orig_generation, op->level, 1);
958 if (ret) {
959 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 "root %llu gen %llu owner %u\n",
961 (unsigned long long)op->bytenr,
962 (unsigned long long)extent_root->root_key.objectid,
963 (unsigned long long)op->orig_generation, op->level);
964 btrfs_print_leaf(extent_root, path->nodes[0]);
965 WARN_ON(1);
966 goto out;
967 }
968
969 extent_slot = path->slots[0];
970 num_to_del = 1;
971 found_extent = 0;
972
973 /*
974 * if we aren't the first item on the leaf we can move back one and see
975 * if our ref is right next to our extent item
976 */
977 if (likely(extent_slot)) {
978 extent_slot--;
979 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 extent_slot);
981 if (found_key.objectid == op->bytenr &&
982 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 found_key.offset == op->num_bytes) {
984 num_to_del++;
985 found_extent = 1;
986 }
987 }
988
989 /*
990 * if we didn't find the extent we need to delete the backref and then
991 * search for the extent item key so we can update its ref count
992 */
993 if (!found_extent) {
994 key.objectid = op->bytenr;
995 key.type = BTRFS_EXTENT_ITEM_KEY;
996 key.offset = op->num_bytes;
997
998 ret = remove_extent_backref(trans, extent_root, path);
999 BUG_ON(ret);
1000 btrfs_release_path(extent_root, path);
1001 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 BUG_ON(ret);
1003 extent_slot = path->slots[0];
1004 }
1005
1006 /* this is where we update the ref count for the extent */
1007 leaf = path->nodes[0];
1008 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 refs = btrfs_extent_refs(leaf, ei);
1010 BUG_ON(refs == 0);
1011 refs--;
1012 btrfs_set_extent_refs(leaf, ei, refs);
1013
1014 btrfs_mark_buffer_dirty(leaf);
1015
1016 /*
1017 * This extent needs deleting. The reason cur_slot is extent_slot +
1018 * num_to_del is because extent_slot points to the slot where the extent
1019 * is, and if the backref was not right next to the extent we will be
1020 * deleting at least 1 item, and will want to start searching at the
1021 * slot directly next to extent_slot. However, if we did find the
1022 * backref next to the extent item then we will be deleting at least 2
1023 * items and will want to start searching directly after the ref slot
1024 */
1025 if (!refs) {
1026 struct list_head *pos, *n, *end;
1027 int cur_slot = extent_slot+num_to_del;
1028 u64 super_used;
1029 u64 root_used;
1030
1031 path->slots[0] = extent_slot;
1032 bytes_freed = op->num_bytes;
1033
1034 mutex_lock(&info->pinned_mutex);
1035 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 op->num_bytes, op->level >=
1037 BTRFS_FIRST_FREE_OBJECTID);
1038 mutex_unlock(&info->pinned_mutex);
1039 BUG_ON(ret < 0);
1040 op->del = ret;
1041
1042 /*
1043 * we need to see if we can delete multiple things at once, so
1044 * start looping through the list of extents we are wanting to
1045 * delete and see if their extent/backrefs are right next to
1046 * each other and the extents only have 1 ref
1047 */
1048 for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 struct pending_extent_op *tmp;
1050
1051 tmp = list_entry(pos, struct pending_extent_op, list);
1052
1053 /* we only want to delete extent+ref at this stage */
1054 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 break;
1056
1057 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 if (found_key.objectid != tmp->bytenr ||
1059 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 found_key.offset != tmp->num_bytes)
1061 break;
1062
1063 /* check to make sure this extent only has one ref */
1064 ei = btrfs_item_ptr(leaf, cur_slot,
1065 struct btrfs_extent_item);
1066 if (btrfs_extent_refs(leaf, ei) != 1)
1067 break;
1068
1069 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 if (found_key.objectid != tmp->bytenr ||
1071 found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 found_key.offset != tmp->orig_parent)
1073 break;
1074
1075 /*
1076 * the ref is right next to the extent, we can set the
1077 * ref count to 0 since we will delete them both now
1078 */
1079 btrfs_set_extent_refs(leaf, ei, 0);
1080
1081 /* pin down the bytes for this extent */
1082 mutex_lock(&info->pinned_mutex);
1083 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 tmp->num_bytes, tmp->level >=
1085 BTRFS_FIRST_FREE_OBJECTID);
1086 mutex_unlock(&info->pinned_mutex);
1087 BUG_ON(ret < 0);
1088
1089 /*
1090 * use the del field to tell if we need to go ahead and
1091 * free up the extent when we delete the item or not.
1092 */
1093 tmp->del = ret;
1094 bytes_freed += tmp->num_bytes;
1095
1096 num_to_del += 2;
1097 cur_slot += 2;
1098 }
1099 end = pos;
1100
1101 /* update the free space counters */
1102 spin_lock(&info->delalloc_lock);
1103 super_used = btrfs_super_bytes_used(&info->super_copy);
1104 btrfs_set_super_bytes_used(&info->super_copy,
1105 super_used - bytes_freed);
1106
1107 root_used = btrfs_root_used(&extent_root->root_item);
1108 btrfs_set_root_used(&extent_root->root_item,
1109 root_used - bytes_freed);
1110 spin_unlock(&info->delalloc_lock);
1111
1112 /* delete the items */
1113 ret = btrfs_del_items(trans, extent_root, path,
1114 path->slots[0], num_to_del);
1115 BUG_ON(ret);
1116
1117 /*
1118 * loop through the extents we deleted and do the cleanup work
1119 * on them
1120 */
1121 for (pos = cur, n = pos->next; pos != end;
1122 pos = n, n = pos->next) {
1123 struct pending_extent_op *tmp;
1124 tmp = list_entry(pos, struct pending_extent_op, list);
1125
1126 /*
1127 * remember tmp->del tells us whether or not we pinned
1128 * down the extent
1129 */
1130 ret = update_block_group(trans, extent_root,
1131 tmp->bytenr, tmp->num_bytes, 0,
1132 tmp->del);
1133 BUG_ON(ret);
1134
1135 list_del_init(&tmp->list);
1136 unlock_extent(&info->extent_ins, tmp->bytenr,
1137 tmp->bytenr + tmp->num_bytes - 1,
1138 GFP_NOFS);
1139 kfree(tmp);
1140 }
1141 } else if (refs && found_extent) {
1142 /*
1143 * the ref and extent were right next to each other, but the
1144 * extent still has a ref, so just free the backref and keep
1145 * going
1146 */
1147 ret = remove_extent_backref(trans, extent_root, path);
1148 BUG_ON(ret);
1149
1150 list_del_init(&op->list);
1151 unlock_extent(&info->extent_ins, op->bytenr,
1152 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 kfree(op);
1154 } else {
1155 /*
1156 * the extent has multiple refs and the backref we were looking
1157 * for was not right next to it, so just unlock and go next,
1158 * we're good to go
1159 */
1160 list_del_init(&op->list);
1161 unlock_extent(&info->extent_ins, op->bytenr,
1162 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 kfree(op);
1164 }
1165
1166 btrfs_release_path(extent_root, path);
1167 if (!list_empty(del_list))
1168 goto search;
1169
1170out:
1171 btrfs_free_path(path);
1172 return ret;
1173}
1174
1175static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 690static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root, u64 bytenr, 691 struct btrfs_root *root, u64 bytenr,
692 u64 num_bytes,
1177 u64 orig_parent, u64 parent, 693 u64 orig_parent, u64 parent,
1178 u64 orig_root, u64 ref_root, 694 u64 orig_root, u64 ref_root,
1179 u64 orig_generation, u64 ref_generation, 695 u64 orig_generation, u64 ref_generation,
1180 u64 owner_objectid) 696 u64 owner_objectid)
1181{ 697{
1182 int ret; 698 int ret;
1183 struct btrfs_root *extent_root = root->fs_info->extent_root; 699 int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
1184 struct btrfs_path *path;
1185
1186 if (root == root->fs_info->extent_root) {
1187 struct pending_extent_op *extent_op;
1188 u64 num_bytes;
1189
1190 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 mutex_lock(&root->fs_info->extent_ins_mutex);
1193 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 u64 priv;
1196 ret = get_state_private(&root->fs_info->extent_ins,
1197 bytenr, &priv);
1198 BUG_ON(ret);
1199 extent_op = (struct pending_extent_op *)
1200 (unsigned long)priv;
1201 BUG_ON(extent_op->parent != orig_parent);
1202 BUG_ON(extent_op->generation != orig_generation);
1203 700
1204 extent_op->parent = parent; 701 ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
1205 extent_op->generation = ref_generation; 702 orig_parent, parent, orig_root,
1206 } else { 703 ref_root, orig_generation,
1207 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 704 ref_generation, owner_objectid, pin);
1208 BUG_ON(!extent_op);
1209
1210 extent_op->type = PENDING_BACKREF_UPDATE;
1211 extent_op->bytenr = bytenr;
1212 extent_op->num_bytes = num_bytes;
1213 extent_op->parent = parent;
1214 extent_op->orig_parent = orig_parent;
1215 extent_op->generation = ref_generation;
1216 extent_op->orig_generation = orig_generation;
1217 extent_op->level = (int)owner_objectid;
1218 INIT_LIST_HEAD(&extent_op->list);
1219 extent_op->del = 0;
1220
1221 set_extent_bits(&root->fs_info->extent_ins,
1222 bytenr, bytenr + num_bytes - 1,
1223 EXTENT_WRITEBACK, GFP_NOFS);
1224 set_state_private(&root->fs_info->extent_ins,
1225 bytenr, (unsigned long)extent_op);
1226 }
1227 mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 return 0;
1229 }
1230
1231 path = btrfs_alloc_path();
1232 if (!path)
1233 return -ENOMEM;
1234 ret = lookup_extent_backref(trans, extent_root, path,
1235 bytenr, orig_parent, orig_root,
1236 orig_generation, owner_objectid, 1);
1237 if (ret)
1238 goto out;
1239 ret = remove_extent_backref(trans, extent_root, path);
1240 if (ret)
1241 goto out;
1242 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 parent, ref_root, ref_generation,
1244 owner_objectid);
1245 BUG_ON(ret); 705 BUG_ON(ret);
1246 finish_current_insert(trans, extent_root, 0);
1247 del_pending_extents(trans, extent_root, 0);
1248out:
1249 btrfs_free_path(path);
1250 return ret; 706 return ret;
1251} 707}
1252 708
1253int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, 709int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root, u64 bytenr, 710 struct btrfs_root *root, u64 bytenr,
1255 u64 orig_parent, u64 parent, 711 u64 num_bytes, u64 orig_parent, u64 parent,
1256 u64 ref_root, u64 ref_generation, 712 u64 ref_root, u64 ref_generation,
1257 u64 owner_objectid) 713 u64 owner_objectid)
1258{ 714{
@@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1260 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 716 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 717 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 return 0; 718 return 0;
1263 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, 719
1264 parent, ref_root, ref_root, 720 ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
1265 ref_generation, ref_generation, 721 orig_parent, parent, ref_root,
1266 owner_objectid); 722 ref_root, ref_generation,
723 ref_generation, owner_objectid);
1267 return ret; 724 return ret;
1268} 725}
1269
1270static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 726static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 struct btrfs_root *root, u64 bytenr, 727 struct btrfs_root *root, u64 bytenr,
728 u64 num_bytes,
1272 u64 orig_parent, u64 parent, 729 u64 orig_parent, u64 parent,
1273 u64 orig_root, u64 ref_root, 730 u64 orig_root, u64 ref_root,
1274 u64 orig_generation, u64 ref_generation, 731 u64 orig_generation, u64 ref_generation,
1275 u64 owner_objectid) 732 u64 owner_objectid)
1276{ 733{
734 int ret;
735
736 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
737 ref_generation, owner_objectid,
738 BTRFS_ADD_DELAYED_REF, 0);
739 BUG_ON(ret);
740 return ret;
741}
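Both ref-update paths above now only record intent; the extent tree is touched later, in batches. A simplified model of that recording step — a linked list stands in for the kernel's rbtree of ref heads, and every name here is illustrative:

#include <stdio.h>
#include <stdlib.h>

enum { ADD_DELAYED_REF, DROP_DELAYED_REF };

struct delayed_ref {
        unsigned long long bytenr, num_bytes, parent, root;
        int action;
        int ref_mod;            /* how many refs this node adds/drops */
        struct delayed_ref *next;
};

static struct delayed_ref *queue;

static int add_delayed_ref(unsigned long long bytenr,
                           unsigned long long num_bytes,
                           unsigned long long parent,
                           unsigned long long root, int action)
{
        struct delayed_ref *ref;

        /* duplicate refs for the same extent coalesce via ref_mod,
         * so running the queue later does one tree update per node */
        for (ref = queue; ref; ref = ref->next) {
                if (ref->bytenr == bytenr && ref->parent == parent &&
                    ref->root == root && ref->action == action) {
                        ref->ref_mod++;
                        return 0;
                }
        }
        ref = calloc(1, sizeof(*ref));
        if (!ref)
                return -1;
        ref->bytenr = bytenr;
        ref->num_bytes = num_bytes;
        ref->parent = parent;
        ref->root = root;
        ref->action = action;
        ref->ref_mod = 1;
        ref->next = queue;
        queue = ref;
        return 0;
}

int main(void)
{
        add_delayed_ref(4096, 4096, 0, 5, ADD_DELAYED_REF);
        add_delayed_ref(4096, 4096, 0, 5, ADD_DELAYED_REF);
        printf("ref_mod = %d\n", queue->ref_mod);  /* 2: one tree op later */
        return 0;
}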
742
743static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
744 struct btrfs_root *root, u64 bytenr,
745 u64 num_bytes, u64 parent, u64 ref_root,
746 u64 ref_generation, u64 owner_objectid,
747 int refs_to_add)
748{
1277 struct btrfs_path *path; 749 struct btrfs_path *path;
1278 int ret; 750 int ret;
1279 struct btrfs_key key; 751 struct btrfs_key key;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1286 return -ENOMEM; 758 return -ENOMEM;
1287 759
1288 path->reada = 1; 760 path->reada = 1;
761 path->leave_spinning = 1;
1289 key.objectid = bytenr; 762 key.objectid = bytenr;
1290 key.type = BTRFS_EXTENT_ITEM_KEY; 763 key.type = BTRFS_EXTENT_ITEM_KEY;
1291 key.offset = (u64)-1; 764 key.offset = num_bytes;
1292 765
1293 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 766 /* first find the extent item and update its reference count */
1294 0, 1); 767 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1295 if (ret < 0) 768 path, 0, 1);
769 if (ret < 0) {
770 btrfs_set_path_blocking(path);
1296 return ret; 771 return ret;
1297 BUG_ON(ret == 0 || path->slots[0] == 0); 772 }
1298 773
1299 path->slots[0]--; 774 if (ret > 0) {
775 WARN_ON(1);
776 btrfs_free_path(path);
777 return -EIO;
778 }
1300 l = path->nodes[0]; 779 l = path->nodes[0];
1301 780
1302 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 781 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1310 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); 789 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 790
1312 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); 791 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
792
1313 refs = btrfs_extent_refs(l, item); 793 refs = btrfs_extent_refs(l, item);
1314 btrfs_set_extent_refs(l, item, refs + 1); 794 btrfs_set_extent_refs(l, item, refs + refs_to_add);
795 btrfs_unlock_up_safe(path, 1);
796
1315 btrfs_mark_buffer_dirty(path->nodes[0]); 797 btrfs_mark_buffer_dirty(path->nodes[0]);
1316 798
1317 btrfs_release_path(root->fs_info->extent_root, path); 799 btrfs_release_path(root->fs_info->extent_root, path);
1318 800
1319 path->reada = 1; 801 path->reada = 1;
802 path->leave_spinning = 1;
803
804 /* now insert the actual backref */
1320 ret = insert_extent_backref(trans, root->fs_info->extent_root, 805 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 path, bytenr, parent, 806 path, bytenr, parent,
1322 ref_root, ref_generation, 807 ref_root, ref_generation,
1323 owner_objectid); 808 owner_objectid, refs_to_add);
1324 BUG_ON(ret); 809 BUG_ON(ret);
1325 finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 del_pending_extents(trans, root->fs_info->extent_root, 0);
1327
1328 btrfs_free_path(path); 810 btrfs_free_path(path);
1329 return 0; 811 return 0;
1330} 812}
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1339 if (ref_root == BTRFS_TREE_LOG_OBJECTID && 821 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 822 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 return 0; 823 return 0;
1342 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, 824
825 ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
1343 0, ref_root, 0, ref_generation, 826 0, ref_root, 0, ref_generation,
1344 owner_objectid); 827 owner_objectid);
1345 return ret; 828 return ret;
1346} 829}
1347 830
1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 831static int drop_delayed_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_root *root) 832 struct btrfs_root *root,
833 struct btrfs_delayed_ref_node *node)
834{
835 int ret = 0;
836 struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
837
838 BUG_ON(node->ref_mod == 0);
839 ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
840 node->parent, ref->root, ref->generation,
841 ref->owner_objectid, ref->pin, node->ref_mod);
842
843 return ret;
844}
845
846/* helper function to actually process a single delayed ref entry */
847static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
848 struct btrfs_root *root,
849 struct btrfs_delayed_ref_node *node,
850 int insert_reserved)
1350{ 851{
1351 u64 start;
1352 u64 end;
1353 int ret; 852 int ret;
853 struct btrfs_delayed_ref *ref;
854
855 if (node->parent == (u64)-1) {
856 struct btrfs_delayed_ref_head *head;
857 /*
858 * we've hit the end of the chain and we were supposed
859 * to insert this extent into the tree. But it got
860 * deleted before we ever needed to insert it, so all
861 * we have to do is clean up the accounting
862 */
863 if (insert_reserved) {
864 update_reserved_extents(root, node->bytenr,
865 node->num_bytes, 0);
866 }
867 head = btrfs_delayed_node_to_head(node);
868 mutex_unlock(&head->mutex);
869 return 0;
870 }
1354 871
1355 while(1) { 872 ref = btrfs_delayed_node_to_ref(node);
1356 finish_current_insert(trans, root->fs_info->extent_root, 1); 873 if (ref->action == BTRFS_ADD_DELAYED_REF) {
1357 del_pending_extents(trans, root->fs_info->extent_root, 1); 874 if (insert_reserved) {
875 struct btrfs_key ins;
1358 876
1359 /* is there more work to do? */ 877 ins.objectid = node->bytenr;
1360 ret = find_first_extent_bit(&root->fs_info->pending_del, 878 ins.offset = node->num_bytes;
1361 0, &start, &end, EXTENT_WRITEBACK); 879 ins.type = BTRFS_EXTENT_ITEM_KEY;
1362 if (!ret) 880
1363 continue; 881 /* record the full extent allocation */
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins, 882 ret = __btrfs_alloc_reserved_extent(trans, root,
1365 0, &start, &end, EXTENT_WRITEBACK); 883 node->parent, ref->root,
1366 if (!ret) 884 ref->generation, ref->owner_objectid,
1367 continue; 885 &ins, node->ref_mod);
1368 break; 886 update_reserved_extents(root, node->bytenr,
887 node->num_bytes, 0);
888 } else {
889 /* just add one backref */
890 ret = add_extent_ref(trans, root, node->bytenr,
891 node->num_bytes,
892 node->parent, ref->root, ref->generation,
893 ref->owner_objectid, node->ref_mod);
894 }
895 BUG_ON(ret);
896 } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
897 WARN_ON(insert_reserved);
898 ret = drop_delayed_ref(trans, root, node);
1369 } 899 }
1370 return 0; 900 return 0;
1371} 901}
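run_one_delayed_ref() is the dispatch point for the whole scheme: a head node reached with nothing left to do only unwinds reservation accounting, the first ADD after an allocation inserts the full extent item, later ADDs just add backrefs, and DROPs free references. A sketch of that dispatch, with printf() standing in for the real tree operations and all names invented:

#include <stdio.h>

enum { ADD_DELAYED_REF, DROP_DELAYED_REF };

struct node {
        int is_head;    /* node->parent == (u64)-1 in the kernel */
        int action;
        int ref_mod;
};

static void run_one(const struct node *n, int insert_reserved)
{
        if (n->is_head) {
                /* extent died before it was ever inserted: only the
                 * reserved-space accounting needs unwinding */
                if (insert_reserved)
                        printf("release reservation\n");
                return;
        }
        if (n->action == ADD_DELAYED_REF) {
                if (insert_reserved)
                        printf("insert extent item with %d ref(s)\n",
                               n->ref_mod);
                else
                        printf("add backref, +%d\n", n->ref_mod);
        } else {
                printf("drop %d ref(s), free extent if it hits zero\n",
                       n->ref_mod);
        }
}

int main(void)
{
        struct node add = { 0, ADD_DELAYED_REF, 2 };
        struct node head = { 1, 0, 0 };

        run_one(&add, 1);       /* first ref after allocation */
        run_one(&head, 0);      /* head runs last: accounting only */
        return 0;
}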
1372 902
1373int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, 903static noinline struct btrfs_delayed_ref_node *
1374 struct btrfs_root *root, u64 bytenr, 904select_delayed_ref(struct btrfs_delayed_ref_head *head)
1375 u64 num_bytes, u32 *refs)
1376{ 905{
1377 struct btrfs_path *path; 906 struct rb_node *node;
907 struct btrfs_delayed_ref_node *ref;
908 int action = BTRFS_ADD_DELAYED_REF;
909again:
910 /*
911 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
912 * this prevents the ref count from going down to zero when
913 * there are still pending delayed refs.
914 */
915 node = rb_prev(&head->node.rb_node);
916 while (1) {
917 if (!node)
918 break;
919 ref = rb_entry(node, struct btrfs_delayed_ref_node,
920 rb_node);
921 if (ref->bytenr != head->node.bytenr)
922 break;
923 if (btrfs_delayed_node_to_ref(ref)->action == action)
924 return ref;
925 node = rb_prev(node);
926 }
927 if (action == BTRFS_ADD_DELAYED_REF) {
928 action = BTRFS_DROP_DELAYED_REF;
929 goto again;
930 }
931 return NULL;
932}
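The ADD-before-DROP ordering above is the invariant that keeps an extent alive while its queue still holds additions. Distilled to an array scan — the kernel walks rb_prev() from the head node instead; this layout is purely illustrative:

#include <stdio.h>

enum { ADD, DROP };

static int select_ref(const int *actions, int n)
{
        int want = ADD;         /* never run a DROP while ADDs remain */
again:
        for (int i = 0; i < n; i++)
                if (actions[i] == want)
                        return i;
        if (want == ADD) {
                want = DROP;
                goto again;
        }
        return -1;              /* nothing pending */
}

int main(void)
{
        int pending[] = { DROP, ADD, DROP };

        /* prints 1: the ADD runs first even though a DROP is older */
        printf("run index %d first\n", select_ref(pending, 3));
        return 0;
}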
933
934static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
935 struct btrfs_root *root,
936 struct list_head *cluster)
937{
938 struct btrfs_delayed_ref_root *delayed_refs;
939 struct btrfs_delayed_ref_node *ref;
940 struct btrfs_delayed_ref_head *locked_ref = NULL;
1378 int ret; 941 int ret;
1379 struct btrfs_key key; 942 int count = 0;
1380 struct extent_buffer *l; 943 int must_insert_reserved = 0;
1381 struct btrfs_extent_item *item;
1382 944
1383 WARN_ON(num_bytes < root->sectorsize); 945 delayed_refs = &trans->transaction->delayed_refs;
1384 path = btrfs_alloc_path(); 946 while (1) {
1385 path->reada = 1; 947 if (!locked_ref) {
1386 key.objectid = bytenr; 948 /* pick a new head ref from the cluster list */
1387 key.offset = num_bytes; 949 if (list_empty(cluster))
1388 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 950 break;
1389 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 951
1390 0, 0); 952 locked_ref = list_entry(cluster->next,
1391 if (ret < 0) 953 struct btrfs_delayed_ref_head, cluster);
1392 goto out; 954
1393 if (ret != 0) { 955 /* grab the lock that says we are going to process
1394 btrfs_print_leaf(root, path->nodes[0]); 956 * all the refs for this head */
1395 printk(KERN_INFO "btrfs failed to find block number %llu\n", 957 ret = btrfs_delayed_ref_lock(trans, locked_ref);
1396 (unsigned long long)bytenr); 958
1397 BUG(); 959 /*
960 * we may have dropped the spin lock to get the head
961 * mutex lock, and that might have given someone else
962 * time to free the head. If that's true, it has been
963 * removed from our list and we can move on.
964 */
965 if (ret == -EAGAIN) {
966 locked_ref = NULL;
967 count++;
968 continue;
969 }
970 }
971
972 /*
973 * record the must insert reserved flag before we
974 * drop the spin lock.
975 */
976 must_insert_reserved = locked_ref->must_insert_reserved;
977 locked_ref->must_insert_reserved = 0;
978
979 /*
980 * locked_ref is the head node, so we have to go one
981 * node back for any delayed ref updates
982 */
983 ref = select_delayed_ref(locked_ref);
984 if (!ref) {
985 /* all delayed refs have been processed, go ahead
986 * and send the head node to run_one_delayed_ref,
987 * so that any accounting fixes can happen
988 */
989 ref = &locked_ref->node;
990 list_del_init(&locked_ref->cluster);
991 locked_ref = NULL;
992 }
993
994 ref->in_tree = 0;
995 rb_erase(&ref->rb_node, &delayed_refs->root);
996 delayed_refs->num_entries--;
997 spin_unlock(&delayed_refs->lock);
998
999 ret = run_one_delayed_ref(trans, root, ref,
1000 must_insert_reserved);
1001 BUG_ON(ret);
1002 btrfs_put_delayed_ref(ref);
1003
1004 count++;
1005 cond_resched();
1006 spin_lock(&delayed_refs->lock);
1007 }
1008 return count;
1009}
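One subtlety in this loop: must_insert_reserved has to be latched and cleared while delayed_refs->lock is still held, because the moment the lock drops another task can queue a new ref against the same extent and set the flag again. Modeled with a pthread mutex in place of the kernel spinlock, names invented:

#include <pthread.h>
#include <stdio.h>

struct ref_head {
        int must_insert_reserved;
};

static pthread_mutex_t delayed_refs_lock = PTHREAD_MUTEX_INITIALIZER;

static void process_head(struct ref_head *head)
{
        int must_insert;

        pthread_mutex_lock(&delayed_refs_lock);
        /* read-and-clear while serialized: whoever latches a set
         * flag owns the extent-insertion work */
        must_insert = head->must_insert_reserved;
        head->must_insert_reserved = 0;
        pthread_mutex_unlock(&delayed_refs_lock);

        printf("run refs, insert_reserved=%d\n", must_insert);
}

int main(void)
{
        struct ref_head head = { 1 };

        process_head(&head);    /* insert_reserved=1 */
        process_head(&head);    /* flag already consumed: 0 */
        return 0;
}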
1010
1011/*
1012 * this starts processing the delayed reference count updates and
1013 * extent insertions we have queued up so far. count can be
1014 * 0, which means to process everything in the tree at the start
1015 * of the run (but not newly added entries), or it can be some target
1016 * number you'd like to process.
1017 */
1018int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root, unsigned long count)
1020{
1021 struct rb_node *node;
1022 struct btrfs_delayed_ref_root *delayed_refs;
1023 struct btrfs_delayed_ref_node *ref;
1024 struct list_head cluster;
1025 int ret;
1026 int run_all = count == (unsigned long)-1;
1027 int run_most = 0;
1028
1029 if (root == root->fs_info->extent_root)
1030 root = root->fs_info->tree_root;
1031
1032 delayed_refs = &trans->transaction->delayed_refs;
1033 INIT_LIST_HEAD(&cluster);
1034again:
1035 spin_lock(&delayed_refs->lock);
1036 if (count == 0) {
1037 count = delayed_refs->num_entries * 2;
1038 run_most = 1;
1039 }
1040 while (1) {
1041 if (!(run_all || run_most) &&
1042 delayed_refs->num_heads_ready < 64)
1043 break;
1044
1045 /*
1046 * go find something we can process in the rbtree. We start at
1047 * the beginning of the tree, and then build a cluster
1048 * of refs to process starting at the first one we are able to
1049 * lock
1050 */
1051 ret = btrfs_find_ref_cluster(trans, &cluster,
1052 delayed_refs->run_delayed_start);
1053 if (ret)
1054 break;
1055
1056 ret = run_clustered_refs(trans, root, &cluster);
1057 BUG_ON(ret < 0);
1058
1059 count -= min_t(unsigned long, ret, count);
1060
1061 if (count == 0)
1062 break;
1063 }
1064
1065 if (run_all) {
1066 node = rb_first(&delayed_refs->root);
1067 if (!node)
1068 goto out;
1069 count = (unsigned long)-1;
1070
1071 while (node) {
1072 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1073 rb_node);
1074 if (btrfs_delayed_ref_is_head(ref)) {
1075 struct btrfs_delayed_ref_head *head;
1076
1077 head = btrfs_delayed_node_to_head(ref);
1078 atomic_inc(&ref->refs);
1079
1080 spin_unlock(&delayed_refs->lock);
1081 mutex_lock(&head->mutex);
1082 mutex_unlock(&head->mutex);
1083
1084 btrfs_put_delayed_ref(ref);
1085 cond_resched();
1086 goto again;
1087 }
1088 node = rb_next(node);
1089 }
1090 spin_unlock(&delayed_refs->lock);
1091 schedule_timeout(1);
1092 goto again;
1398 } 1093 }
1399 l = path->nodes[0];
1400 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401 *refs = btrfs_extent_refs(l, item);
1402out: 1094out:
1403 btrfs_free_path(path); 1095 spin_unlock(&delayed_refs->lock);
1404 return 0; 1096 return 0;
1405} 1097}
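The count argument has three modes: a plain target, 0 meaning "everything queued at the start of the run" (hence the num_entries * 2 snapshot), and (unsigned long)-1 meaning drain completely, including refs queued by the processing itself. A self-contained model of the driver loop, with run_cluster() standing in for btrfs_find_ref_cluster() plus run_clustered_refs():

#include <stdio.h>

static unsigned long pending = 10;      /* queued ref updates */

static unsigned long run_cluster(void)
{
        unsigned long done = pending < 4 ? pending : 4;

        pending -= done;
        return done;
}

static void run_delayed_refs(unsigned long count)
{
        int run_all = count == (unsigned long)-1;

        if (count == 0)         /* roughly: what was queued so far */
                count = pending * 2;

        while (pending) {
                unsigned long ret = run_cluster();

                /* min_t() analogue: never underflow the budget */
                count -= ret < count ? ret : count;
                if (count == 0 && !run_all)
                        break;
        }
        printf("left in queue: %lu\n", pending);
}

int main(void)
{
        run_delayed_refs(0);    /* prints 0: initial backlog drained */
        return 0;
}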
1406 1098
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1624 int refi = 0; 1316 int refi = 0;
1625 int slot; 1317 int slot;
1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1318 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 u64, u64, u64, u64, u64, u64, u64, u64); 1319 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1628 1320
1629 ref_root = btrfs_header_owner(buf); 1321 ref_root = btrfs_header_owner(buf);
1630 ref_generation = btrfs_header_generation(buf); 1322 ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1696 1388
1697 if (level == 0) { 1389 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot); 1390 btrfs_item_key_to_cpu(buf, &key, slot);
1391 fi = btrfs_item_ptr(buf, slot,
1392 struct btrfs_file_extent_item);
1393
1394 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1395 if (bytenr == 0)
1396 continue;
1699 1397
1700 ret = process_func(trans, root, bytenr, 1398 ret = process_func(trans, root, bytenr,
1701 orig_buf->start, buf->start, 1399 btrfs_file_extent_disk_num_bytes(buf, fi),
1702 orig_root, ref_root, 1400 orig_buf->start, buf->start,
1703 orig_generation, ref_generation, 1401 orig_root, ref_root,
1704 key.objectid); 1402 orig_generation, ref_generation,
1403 key.objectid);
1705 1404
1706 if (ret) { 1405 if (ret) {
1707 faili = slot; 1406 faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1709 goto fail; 1408 goto fail;
1710 } 1409 }
1711 } else { 1410 } else {
1712 ret = process_func(trans, root, bytenr, 1411 ret = process_func(trans, root, bytenr, buf->len,
1713 orig_buf->start, buf->start, 1412 orig_buf->start, buf->start,
1714 orig_root, ref_root, 1413 orig_root, ref_root,
1715 orig_generation, ref_generation, 1414 orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
1786 if (bytenr == 0) 1485 if (bytenr == 0)
1787 continue; 1486 continue;
1788 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1487 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 orig_buf->start, buf->start, 1488 btrfs_file_extent_disk_num_bytes(buf, fi),
1790 orig_root, ref_root, 1489 orig_buf->start, buf->start,
1791 orig_generation, ref_generation, 1490 orig_root, ref_root, orig_generation,
1792 key.objectid); 1491 ref_generation, key.objectid);
1793 if (ret) 1492 if (ret)
1794 goto fail; 1493 goto fail;
1795 } else { 1494 } else {
1796 bytenr = btrfs_node_blockptr(buf, slot); 1495 bytenr = btrfs_node_blockptr(buf, slot);
1797 ret = __btrfs_update_extent_ref(trans, root, bytenr, 1496 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 orig_buf->start, buf->start, 1497 buf->len, orig_buf->start,
1799 orig_root, ref_root, 1498 buf->start, orig_root, ref_root,
1800 orig_generation, ref_generation, 1499 orig_generation, ref_generation,
1801 level - 1); 1500 level - 1);
1802 if (ret) 1501 if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1815 struct btrfs_block_group_cache *cache) 1514 struct btrfs_block_group_cache *cache)
1816{ 1515{
1817 int ret; 1516 int ret;
1818 int pending_ret;
1819 struct btrfs_root *extent_root = root->fs_info->extent_root; 1517 struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 unsigned long bi; 1518 unsigned long bi;
1821 struct extent_buffer *leaf; 1519 struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1831 btrfs_mark_buffer_dirty(leaf); 1529 btrfs_mark_buffer_dirty(leaf);
1832 btrfs_release_path(extent_root, path); 1530 btrfs_release_path(extent_root, path);
1833fail: 1531fail:
1834 finish_current_insert(trans, extent_root, 0);
1835 pending_ret = del_pending_extents(trans, extent_root, 0);
1836 if (ret) 1532 if (ret)
1837 return ret; 1533 return ret;
1838 if (pending_ret)
1839 return pending_ret;
1840 return 0; 1534 return 0;
1841 1535
1842} 1536}
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2361 clear_extent_dirty(&fs_info->pinned_extents, 2055 clear_extent_dirty(&fs_info->pinned_extents,
2362 bytenr, bytenr + num - 1, GFP_NOFS); 2056 bytenr, bytenr + num - 1, GFP_NOFS);
2363 } 2057 }
2058 mutex_unlock(&root->fs_info->pinned_mutex);
2059
2364 while (num > 0) { 2060 while (num > 0) {
2365 cache = btrfs_lookup_block_group(fs_info, bytenr); 2061 cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 BUG_ON(!cache); 2062 BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2452 u64 end; 2148 u64 end;
2453 int ret; 2149 int ret;
2454 2150
2455 mutex_lock(&root->fs_info->pinned_mutex);
2456 while (1) { 2151 while (1) {
2152 mutex_lock(&root->fs_info->pinned_mutex);
2457 ret = find_first_extent_bit(unpin, 0, &start, &end, 2153 ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 EXTENT_DIRTY); 2154 EXTENT_DIRTY);
2459 if (ret) 2155 if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2461 2157
2462 ret = btrfs_discard_extent(root, start, end + 1 - start); 2158 ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 2159
2160 /* unlocks the pinned mutex */
2464 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2161 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2162 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 2163
2467 if (need_resched()) { 2164 cond_resched();
2468 mutex_unlock(&root->fs_info->pinned_mutex);
2469 cond_resched();
2470 mutex_lock(&root->fs_info->pinned_mutex);
2471 }
2472 } 2165 }
2473 mutex_unlock(&root->fs_info->pinned_mutex); 2166 mutex_unlock(&root->fs_info->pinned_mutex);
2474 return ret; 2167 return ret;
2475} 2168}
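btrfs_update_pinned_extents() now follows a lock-handoff convention: the caller acquires pinned_mutex, the callee releases it (the "unlocks the pinned mutex" comments above), and the loop can reschedule between iterations without holding the lock. Sketched with pthreads, with ranges and names simplified:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pinned_mutex = PTHREAD_MUTEX_INITIALIZER;

/* called with pinned_mutex held; drops it before returning so the
 * caller may sleep (cond_resched() in the kernel) lock-free */
static void update_pinned(unsigned long long start, unsigned long long len)
{
        printf("unpin [%llu, %llu)\n", start, start + len);
        pthread_mutex_unlock(&pinned_mutex);
}

int main(void)
{
        for (int i = 0; i < 2; i++) {
                pthread_mutex_lock(&pinned_mutex);
                /* ... find the next pinned range under the lock ... */
                update_pinned(4096ULL * i, 4096);  /* unlocks for us */
                /* safe point: no lock held across the iteration gap */
        }
        return 0;
}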
2476 2169
2477static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 struct btrfs_root *extent_root, int all)
2479{
2480 u64 start;
2481 u64 end;
2482 u64 priv;
2483 u64 search = 0;
2484 struct btrfs_fs_info *info = extent_root->fs_info;
2485 struct btrfs_path *path;
2486 struct pending_extent_op *extent_op, *tmp;
2487 struct list_head insert_list, update_list;
2488 int ret;
2489 int num_inserts = 0, max_inserts, restart = 0;
2490
2491 path = btrfs_alloc_path();
2492 INIT_LIST_HEAD(&insert_list);
2493 INIT_LIST_HEAD(&update_list);
2494
2495 max_inserts = extent_root->leafsize /
2496 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 sizeof(struct btrfs_extent_ref) +
2498 sizeof(struct btrfs_extent_item));
2499again:
2500 mutex_lock(&info->extent_ins_mutex);
2501 while (1) {
2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 &end, EXTENT_WRITEBACK);
2504 if (ret) {
2505 if (restart && !num_inserts &&
2506 list_empty(&update_list)) {
2507 restart = 0;
2508 search = 0;
2509 continue;
2510 }
2511 break;
2512 }
2513
2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 if (!ret) {
2516 if (all)
2517 restart = 1;
2518 search = end + 1;
2519 if (need_resched()) {
2520 mutex_unlock(&info->extent_ins_mutex);
2521 cond_resched();
2522 mutex_lock(&info->extent_ins_mutex);
2523 }
2524 continue;
2525 }
2526
2527 ret = get_state_private(&info->extent_ins, start, &priv);
2528 BUG_ON(ret);
2529 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530
2531 if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 num_inserts++;
2533 list_add_tail(&extent_op->list, &insert_list);
2534 search = end + 1;
2535 if (num_inserts == max_inserts) {
2536 restart = 1;
2537 break;
2538 }
2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 list_add_tail(&extent_op->list, &update_list);
2541 search = end + 1;
2542 } else {
2543 BUG();
2544 }
2545 }
2546
2547 /*
2548 * process the update list, clear the writeback bit for it, and if
2549 * somebody marked this thing for deletion then just unlock it and be
2550 * done; free_extents will handle it
2551 */
2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 extent_op->bytenr + extent_op->num_bytes - 1,
2555 EXTENT_WRITEBACK, GFP_NOFS);
2556 if (extent_op->del) {
2557 list_del_init(&extent_op->list);
2558 unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 extent_op->bytenr + extent_op->num_bytes
2560 - 1, GFP_NOFS);
2561 kfree(extent_op);
2562 }
2563 }
2564 mutex_unlock(&info->extent_ins_mutex);
2565
2566 /*
2567 * still have things left on the update list, go ahead and update
2568 * everything
2569 */
2570 if (!list_empty(&update_list)) {
2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so let's start over */
2575 if (all)
2576 restart = 1;
2577 }
2578
2579 /*
2580 * if no inserts need to be done, but we skipped some extents and we
2581 * need to make sure everything is cleaned then reset everything and
2582 * go back to the beginning
2583 */
2584 if (!num_inserts && restart) {
2585 search = 0;
2586 restart = 0;
2587 INIT_LIST_HEAD(&update_list);
2588 INIT_LIST_HEAD(&insert_list);
2589 goto again;
2590 } else if (!num_inserts) {
2591 goto out;
2592 }
2593
2594 /*
2595 * process the insert extents list. Again if we are deleting this
2596 * extent, then just unlock it, pin down the bytes if need be, and be
2597 * done with it. Saves us from having to actually insert the extent
2598 * into the tree and then subsequently come along and delete it
2599 */
2600 mutex_lock(&info->extent_ins_mutex);
2601 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 extent_op->bytenr + extent_op->num_bytes - 1,
2604 EXTENT_WRITEBACK, GFP_NOFS);
2605 if (extent_op->del) {
2606 u64 used;
2607 list_del_init(&extent_op->list);
2608 unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 extent_op->bytenr + extent_op->num_bytes
2610 - 1, GFP_NOFS);
2611
2612 mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 ret = pin_down_bytes(trans, extent_root,
2614 extent_op->bytenr,
2615 extent_op->num_bytes, 0);
2616 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617
2618 spin_lock(&info->delalloc_lock);
2619 used = btrfs_super_bytes_used(&info->super_copy);
2620 btrfs_set_super_bytes_used(&info->super_copy,
2621 used - extent_op->num_bytes);
2622 used = btrfs_root_used(&extent_root->root_item);
2623 btrfs_set_root_used(&extent_root->root_item,
2624 used - extent_op->num_bytes);
2625 spin_unlock(&info->delalloc_lock);
2626
2627 ret = update_block_group(trans, extent_root,
2628 extent_op->bytenr,
2629 extent_op->num_bytes,
2630 0, ret > 0);
2631 BUG_ON(ret);
2632 kfree(extent_op);
2633 num_inserts--;
2634 }
2635 }
2636 mutex_unlock(&info->extent_ins_mutex);
2637
2638 ret = insert_extents(trans, extent_root, path, &insert_list,
2639 num_inserts);
2640 BUG_ON(ret);
2641
2642 /*
2643 * if restart is set for whatever reason we need to go back and start
2644 * searching through the pending list again.
2645 *
2646 * We just inserted some extents, which could have resulted in new
2647 * blocks being allocated, which would result in new blocks needing
2648 * updates, so if all is set we _must_ restart to get the updated
2649 * blocks.
2650 */
2651 if (restart || all) {
2652 INIT_LIST_HEAD(&insert_list);
2653 INIT_LIST_HEAD(&update_list);
2654 search = 0;
2655 restart = 0;
2656 num_inserts = 0;
2657 goto again;
2658 }
2659out:
2660 btrfs_free_path(path);
2661 return 0;
2662}
2663
2664static int pin_down_bytes(struct btrfs_trans_handle *trans, 2170static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 struct btrfs_root *root, 2171 struct btrfs_root *root,
2666 u64 bytenr, u64 num_bytes, int is_data) 2172 struct btrfs_path *path,
2173 u64 bytenr, u64 num_bytes, int is_data,
2174 struct extent_buffer **must_clean)
2667{ 2175{
2668 int err = 0; 2176 int err = 0;
2669 struct extent_buffer *buf; 2177 struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2686 u64 header_transid = btrfs_header_generation(buf); 2194 u64 header_transid = btrfs_header_generation(buf);
2687 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2195 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2196 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2197 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2689 header_transid == trans->transid && 2198 header_transid == trans->transid &&
2690 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2199 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 clean_tree_block(NULL, root, buf); 2200 *must_clean = buf;
2692 btrfs_tree_unlock(buf);
2693 free_extent_buffer(buf);
2694 return 1; 2201 return 1;
2695 } 2202 }
2696 btrfs_tree_unlock(buf); 2203 btrfs_tree_unlock(buf);
2697 } 2204 }
2698 free_extent_buffer(buf); 2205 free_extent_buffer(buf);
2699pinit: 2206pinit:
2207 btrfs_set_path_blocking(path);
2208 mutex_lock(&root->fs_info->pinned_mutex);
2209 /* unlocks the pinned mutex */
2700 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2210 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 2211
2702 BUG_ON(err < 0); 2212 BUG_ON(err < 0);
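Handing the still-locked buffer back through *must_clean, instead of calling clean_tree_block() from inside pin_down_bytes(), defers the expensive cleanup to a point where the caller has finished its own tree work. A small model of the out-parameter pattern, all names invented:

#include <stdio.h>
#include <stdlib.h>

struct buffer {
        int dirty;
};

/* reports a reusable buffer via the out-parameter rather than
 * cleaning it here, deep in the free path with locks held */
static int pin_down(struct buffer *buf, int reusable,
                    struct buffer **must_clean)
{
        if (reusable) {
                *must_clean = buf;      /* caller owns the cleanup */
                return 1;               /* nothing actually pinned */
        }
        printf("pinned for this transaction\n");
        return 0;
}

int main(void)
{
        struct buffer *buf = calloc(1, sizeof(*buf));
        struct buffer *must_clean = NULL;

        buf->dirty = 1;
        pin_down(buf, 1, &must_clean);
        /* ... finish updating the extent tree ... */
        if (must_clean) {
                must_clean->dirty = 0;  /* clean_tree_block() analogue */
                printf("cleaned by the caller\n");
        }
        free(buf);
        return 0;
}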
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, int pin, int mark_free)
+			 u64 owner_objectid, int pin, int mark_free,
+			 int refs_to_drop)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
+	path->leave_spinning = 1;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, parent, root_objectid,
 				    ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			break;
 		}
 		if (!found_extent) {
-			ret = remove_extent_backref(trans, extent_root, path);
+			ret = remove_extent_backref(trans, extent_root, path,
+						    refs_to_drop);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
 			if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 			WARN_ON(1);
 			printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-			       "root %llu gen %llu owner %llu\n",
+			       "parent %llu root %llu gen %llu owner %llu\n",
 			       (unsigned long long)bytenr,
+			       (unsigned long long)parent,
 			       (unsigned long long)root_objectid,
 			       (unsigned long long)ref_generation,
 			       (unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs -= 1;
-	btrfs_set_extent_refs(leaf, ei, refs);
 
+	/*
+	 * we're not allowed to delete the extent item if there
+	 * are other delayed ref updates pending
+	 */
+
+	BUG_ON(refs < refs_to_drop);
+	refs -= refs_to_drop;
+	btrfs_set_extent_refs(leaf, ei, refs);
 	btrfs_mark_buffer_dirty(leaf);
 
-	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+	if (refs == 0 && found_extent &&
+	    path->slots[0] == extent_slot + 1) {
 		struct btrfs_extent_ref *ref;
 		ref = btrfs_item_ptr(leaf, path->slots[0],
 				     struct btrfs_extent_ref);
-		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
 		/* if the back ref and the extent are next to each other
 		 * they get deleted below in one shot
 		 */
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		num_to_del = 2;
 	} else if (found_extent) {
 		/* otherwise delete the extent back ref */
-		ret = remove_extent_backref(trans, extent_root, path);
+		ret = remove_extent_backref(trans, extent_root, path,
+					    refs_to_drop);
 		BUG_ON(ret);
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
 			btrfs_release_path(extent_root, path);
+			path->leave_spinning = 1;
 			ret = btrfs_search_slot(trans, extent_root, &key, path,
 						-1, 1);
 			BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
+		struct extent_buffer *must_clean = NULL;
 
 		if (pin) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, root, path,
+				bytenr, num_bytes,
+				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+				&must_clean);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
 		}
+
 		/* block accounting for super block */
 		spin_lock(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_root_used(&root->root_item,
 				    root_used - num_bytes);
 		spin_unlock(&info->delalloc_lock);
+
+		/*
+		 * it is going to be very rare for someone to be waiting
+		 * on the block we're freeing.  del_items might need to
+		 * schedule, so rather than get fancy, just force it
+		 * to blocking here
+		 */
+		if (must_clean)
+			btrfs_set_lock_blocking(must_clean);
+
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
 		btrfs_release_path(extent_root, path);
 
+		if (must_clean) {
+			clean_tree_block(NULL, root, must_clean);
+			btrfs_tree_unlock(must_clean);
+			free_extent_buffer(must_clean);
+		}
+
 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
+		} else {
+			invalidate_mapping_pages(info->btree_inode->i_mapping,
+			     bytenr >> PAGE_CACHE_SHIFT,
+			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
 		}
 
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
 	return ret;
 }
 
 /*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 root_objectid, u64 ref_generation,
+			       u64 owner_objectid, int pin,
+			       int refs_to_drop)
 {
-	int ret;
-	int err = 0;
-	u64 start;
-	u64 end;
-	u64 priv;
-	u64 search = 0;
-	int nr = 0, skipped = 0;
-	struct extent_io_tree *pending_del;
-	struct extent_io_tree *extent_ins;
-	struct pending_extent_op *extent_op;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct list_head delete_list;
-
-	INIT_LIST_HEAD(&delete_list);
-	extent_ins = &extent_root->fs_info->extent_ins;
-	pending_del = &extent_root->fs_info->pending_del;
-
-again:
-	mutex_lock(&info->extent_ins_mutex);
-	while (1) {
-		ret = find_first_extent_bit(pending_del, search, &start, &end,
-					    EXTENT_WRITEBACK);
-		if (ret) {
-			if (all && skipped && !nr) {
-				search = 0;
-				skipped = 0;
-				continue;
-			}
-			mutex_unlock(&info->extent_ins_mutex);
-			break;
-		}
-
-		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-		if (!ret) {
-			search = end+1;
-			skipped = 1;
-
-			if (need_resched()) {
-				mutex_unlock(&info->extent_ins_mutex);
-				cond_resched();
-				mutex_lock(&info->extent_ins_mutex);
-			}
-
-			continue;
-		}
-		BUG_ON(ret < 0);
-
-		ret = get_state_private(pending_del, start, &priv);
-		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-				  GFP_NOFS);
-		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_WRITEBACK, 0)) {
-			list_add_tail(&extent_op->list, &delete_list);
-			nr++;
-		} else {
-			kfree(extent_op);
-
-			ret = get_state_private(&info->extent_ins, start,
-						&priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
-
-			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_WRITEBACK, GFP_NOFS);
-
-			if (extent_op->type == PENDING_BACKREF_UPDATE) {
-				list_add_tail(&extent_op->list, &delete_list);
-				search = end + 1;
-				nr++;
-				continue;
-			}
-
-			mutex_lock(&extent_root->fs_info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, start,
-					     end + 1 - start, 0);
-			mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, ret > 0);
-
-			unlock_extent(extent_ins, start, end, GFP_NOFS);
-			BUG_ON(ret);
-			kfree(extent_op);
-		}
-		if (ret)
-			err = ret;
-
-		search = end + 1;
-
-		if (need_resched()) {
-			mutex_unlock(&info->extent_ins_mutex);
-			cond_resched();
-			mutex_lock(&info->extent_ins_mutex);
-		}
-	}
+	WARN_ON(num_bytes < root->sectorsize);
 
-	if (nr) {
-		ret = free_extents(trans, extent_root, &delete_list);
-		BUG_ON(ret);
-	}
+	/*
+	 * if metadata always pin
+	 * if data pin when any transaction has committed this
+	 */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	    ref_generation != trans->transid)
+		pin = 1;
 
-	if (all && skipped) {
-		INIT_LIST_HEAD(&delete_list);
-		search = 0;
-		nr = 0;
-		goto again;
-	}
+	if (ref_generation != trans->transid)
+		pin = 1;
 
-	if (!err)
-		finish_current_insert(trans, extent_root, 0);
-	return err;
+	return __free_extent(trans, root, bytenr, num_bytes, parent,
+			     root_objectid, ref_generation,
+			     owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
 /*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
  */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 bytenr, u64 num_bytes, u64 parent,
-			       u64 root_objectid, u64 ref_generation,
-			       u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root, u64 bytenr)
 {
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	int pending_ret;
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct rb_node *node;
 	int ret;
 
-	WARN_ON(num_bytes < root->sectorsize);
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op = NULL;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-						(unsigned long)priv;
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out;
 
-			extent_op->del = 1;
-			if (extent_op->type == PENDING_EXTENT_INSERT) {
-				mutex_unlock(&root->fs_info->extent_ins_mutex);
-				return 0;
-			}
-		}
+	node = rb_prev(&head->node.rb_node);
+	if (!node)
+		goto out;
 
-		if (extent_op) {
-			ref_generation = extent_op->orig_generation;
-			parent = extent_op->orig_parent;
-		}
+	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_DELETE;
-		extent_op->bytenr = bytenr;
-		extent_op->num_bytes = num_bytes;
-		extent_op->parent = parent;
-		extent_op->orig_parent = parent;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = ref_generation;
-		extent_op->level = (int)owner_objectid;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		set_extent_bits(&root->fs_info->pending_del,
-				bytenr, bytenr + num_bytes - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->pending_del,
-				  bytenr, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
-	/* if metadata always pin */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			mutex_lock(&root->fs_info->pinned_mutex);
-			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-			mutex_unlock(&root->fs_info->pinned_mutex);
-			update_reserved_extents(root, bytenr, num_bytes, 0);
-			return 0;
-		}
-		pin = 1;
-	}
+	/* there are still entries for this ref, we can't drop it */
+	if (ref->bytenr == bytenr)
+		goto out;
 
-	/* if data pin when any transaction has committed this */
-	if (ref_generation != trans->transid)
-		pin = 1;
+	/*
+	 * waiting for the lock here would deadlock.  If someone else has it
+	 * locked they are already in the process of dropping it anyway
+	 */
+	if (!mutex_trylock(&head->mutex))
+		goto out;
 
-	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation,
-			    owner_objectid, pin, pin == 0);
+	/*
+	 * at this point we have a head with no other entries.  Go
+	 * ahead and process it.
+	 */
+	head->node.in_tree = 0;
+	rb_erase(&head->node.rb_node, &delayed_refs->root);
 
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-	return ret ? ret : pending_ret;
+	delayed_refs->num_entries--;
+
+	/*
+	 * we don't take a ref on the node because we're removing it from the
+	 * tree, so we just steal the ref the tree was holding.
+	 */
+	delayed_refs->num_heads--;
+	if (list_empty(&head->cluster))
+		delayed_refs->num_heads_ready--;
+
+	list_del_init(&head->cluster);
+	spin_unlock(&delayed_refs->lock);
+
+	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+				  &head->node, head->must_insert_reserved);
+	BUG_ON(ret);
+	btrfs_put_delayed_ref(&head->node);
+	return 0;
+out:
+	spin_unlock(&delayed_refs->lock);
+	return 0;
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
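check_ref_cleanup() deliberately uses mutex_trylock() on the head: blocking there could deadlock against another task that already holds the mutex while dropping the same head. A small pthread sketch of the trylock-or-skip idea (plain user-space code, not the btrfs locking; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t head_mutex = PTHREAD_MUTEX_INITIALIZER;

/* try to take over cleanup of a shared record; if someone else
 * already holds the lock they are dropping it anyway, so skip */
static int try_cleanup(void)
{
	if (pthread_mutex_trylock(&head_mutex) != 0)
		return 0;	/* busy: another dropper owns it */
	/* ... remove the record from the shared structure ... */
	pthread_mutex_unlock(&head_mutex);
	return 1;
}

int main(void)
{
	printf("cleanup %s\n", try_cleanup() ? "done" : "skipped");
	return 0;
}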
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
-				  root_objectid, ref_generation,
-				  owner_objectid, pin);
+	/*
+	 * tree log blocks never actually go into the extent allocation
+	 * tree, just update pinning info and exit early.
+	 *
+	 * data extents referenced by the tree log do need to have
+	 * their reference counts bumped.
+	 */
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		mutex_lock(&root->fs_info->pinned_mutex);
+
+		/* unlocks the pinned mutex */
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+		update_reserved_extents(root, bytenr, num_bytes, 0);
+		ret = 0;
+	} else {
+		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+					    root_objectid, ref_generation,
+					    owner_objectid,
+					    BTRFS_DROP_DELAYED_REF, 1);
+		BUG_ON(ret);
+		ret = check_ref_cleanup(trans, root, bytenr);
+		BUG_ON(ret);
+	}
 	return ret;
 }
 
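With this hunk, btrfs_free_extent() records a BTRFS_DROP_DELAYED_REF instead of editing the extent tree inline; opposing adds and drops against the same bytenr can then cancel before any tree block is touched. A toy model of that batching (a flat array stands in for the rbtree of ref heads; everything here is illustrative, not the btrfs structures):

#include <stdio.h>

struct delayed_ref { unsigned long long bytenr; long delta; };

static struct delayed_ref refs[16];
static int nr_refs;

/* queue a +1 (add) or -1 (drop) against an extent, merging with an
 * existing entry for the same bytenr the way delayed refs coalesce */
static void add_delayed_ref(unsigned long long bytenr, long delta)
{
	for (int i = 0; i < nr_refs; i++) {
		if (refs[i].bytenr == bytenr) {
			refs[i].delta += delta;
			return;
		}
	}
	refs[nr_refs].bytenr = bytenr;
	refs[nr_refs].delta = delta;
	nr_refs++;
}

int main(void)
{
	add_delayed_ref(4096, +1);	/* allocation */
	add_delayed_ref(4096, -1);	/* freed again in the same trans */
	add_delayed_ref(8192, -1);	/* real drop */

	/* "run" the refs: only non-zero deltas reach the extent tree */
	for (int i = 0; i < nr_refs; i++)
		if (refs[i].delta)
			printf("apply %+ld to extent %llu\n",
			       refs[i].delta, refs[i].bytenr);
	return 0;
}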
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins)
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod)
 {
 	int ret;
-	int pending_ret;
 	u64 super_used;
 	u64 root_used;
 	u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 	spin_unlock(&info->delalloc_lock);
 
-	if (root == extent_root) {
-		struct pending_extent_op *extent_op;
-
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-
-		extent_op->type = PENDING_EXTENT_INSERT;
-		extent_op->bytenr = ins->objectid;
-		extent_op->num_bytes = ins->offset;
-		extent_op->parent = parent;
-		extent_op->orig_parent = 0;
-		extent_op->generation = ref_generation;
-		extent_op->orig_generation = 0;
-		extent_op->level = (int)owner;
-		INIT_LIST_HEAD(&extent_op->list);
-		extent_op->del = 0;
-
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-				ins->objectid + ins->offset - 1,
-				EXTENT_WRITEBACK, GFP_NOFS);
-		set_state_private(&root->fs_info->extent_ins,
-				  ins->objectid, (unsigned long)extent_op);
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		goto update_block;
-	}
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].objectid = ins->objectid;
 	keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
 				       sizes, 2);
 	BUG_ON(ret);
 
 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_extent_item);
-	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_extent_ref);
 
 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	trans->alloc_exclude_start = 0;
 	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root, 0);
-	pending_ret = del_pending_extents(trans, extent_root, 0);
 
 	if (ret)
 		goto out;
-	if (pending_ret) {
-		ret = pending_ret;
-		goto out;
-	}
 
-update_block:
 	ret = update_block_group(trans, root, ins->objectid,
 				 ins->offset, 1, 0);
 	if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
-	update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+	ret = btrfs_add_delayed_ref(trans, ins->objectid,
+				    ins->offset, parent, root_objectid,
+				    ref_generation, owner,
+				    BTRFS_ADD_DELAYED_EXTENT, 0);
+	BUG_ON(ret);
 	return ret;
 }
 
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins);
+					    ref_generation, owner, ins, 1);
 	return ret;
 }
 
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
 	BUG_ON(ret);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					root_objectid, ref_generation,
-					owner_objectid, ins);
+		ret = btrfs_add_delayed_ref(trans, ins->objectid,
+					    ins->offset, parent, root_objectid,
+					    ref_generation, owner_objectid,
+					    BTRFS_ADD_DELAYED_EXTENT, 0);
 		BUG_ON(ret);
-
-	} else {
-		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	return ret;
 }
 
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
-	ret = __btrfs_free_extent(trans, root, disk_bytenr,
+	ret = btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < ref->nritems; i++) {
 		info = ref->extents + sorted[i].slot;
-		ret = __btrfs_free_extent(trans, root, info->bytenr,
+		ret = btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 start,
 				     u64 len, u32 *refs)
 {
 	int ret;
 
-	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 		 * we just decrement it below and don't update any
 		 * of the refs the leaf points to.
 		 */
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1)
 			continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
 	 */
 	for (i = 0; i < refi; i++) {
 		bytenr = sorted[i].bytenr;
-		ret = __btrfs_free_extent(trans, root, bytenr,
+		ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, eb->start,
 					root_owner, root_gen, 0, 1);
 		BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
 				path->nodes[*level]->len, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
-		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(trans, root, bytenr,
+						blocksize, &refs);
 		BUG_ON(ret);
 
 		/*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			ret = __btrfs_free_extent(trans, root, bytenr,
+			ret = btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
 	 * cleanup and free the reference on the last node
 	 * we processed
 	 */
-	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
+	int update_count;
 	struct btrfs_root_item *root_item = &root->root_item;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 	}
 	while (1) {
+		unsigned long update;
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		if (trans->transaction->in_commit) {
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing) {
 			ret = -EAGAIN;
 			break;
 		}
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
+		for (update_count = 0; update_count < 16; update_count++) {
+			update = trans->delayed_ref_updates;
+			trans->delayed_ref_updates = 0;
+			if (update)
+				btrfs_run_delayed_refs(trans, root, update);
+			else
+				break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					root->root_key.objectid,
 					trans->transid, key.objectid);
 		BUG_ON(ret);
+
 		ret = btrfs_free_extent(trans, root,
 					bytenr, num_bytes, leaf->start,
 					btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 					ref_path, NULL, NULL);
 	BUG_ON(ret);
 
-	if (root == root->fs_info->extent_root)
-		btrfs_extent_post_op(trans, root);
-
 	return 0;
 }
 
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
 	if (ret)
 		goto out;
@@ -6208,6 +5640,9 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
+
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	extent_root = root->fs_info->extent_root;
 
-	root->fs_info->last_trans_new_blockgroup = trans->transid;
+	root->fs_info->last_trans_log_full_commit = trans->transid;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			 sizeof(cache->item));
 	BUG_ON(ret);
 
-	finish_current_insert(trans, extent_root, 0);
-	ret = del_pending_extents(trans, extent_root, 0);
-	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
 	return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..08085af089e2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb)
 {
-	int set;
 	unsigned long i;
 	unsigned long num_pages;
 	struct page *page;
 
-	u64 start = eb->start;
-	u64 end = start + eb->len - 1;
-
-	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
 	num_pages = num_extent_pages(eb->start, eb->len);
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!set && !PageDirty(page))
+		if (!PageDirty(page))
 			continue;
 
 		lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 		else
 			set_page_private(page, EXTENT_PAGE_PRIVATE);
 
-		/*
-		 * if we're on the last page or the first page and the
-		 * block isn't aligned on a page boundary, do extra checks
-		 * to make sure we don't clean page that is partially dirty
-		 */
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = (u64)page->index << PAGE_CACHE_SHIFT;
-			end = start + PAGE_CACHE_SIZE - 1;
-			if (test_range_bit(tree, start, end,
-					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
-				continue;
-			}
-		}
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
 		if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 {
 	unsigned long i;
 	unsigned long num_pages;
+	int was_dirty = 0;
 
+	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		/* writepage may need to do something special for the
-		 * first page, we have to make sure page->private is
-		 * properly set.  releasepage may drop page->private
-		 * on us if the page isn't already dirty.
-		 */
-		lock_page(page);
-		if (i == 0) {
-			set_page_extent_head(page, eb->len);
-		} else if (PagePrivate(page) &&
-			   page->private != EXTENT_PAGE_PRIVATE) {
-			set_page_extent_mapped(page);
-		}
+	for (i = 0; i < num_pages; i++)
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE - 1,
-				 GFP_NOFS);
-		unlock_page(page);
-	}
-	return 0;
+	return was_dirty;
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 		ret = 0;
 		goto out;
 	}
+	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		ret = 0;
+		goto out;
+	}
 	/* at this point we can safely release the extent buffer */
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++)
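The extent_io.c side replaces per-page dirty ranges with one EXTENT_BUFFER_DIRTY bit in eb->bflags; test_and_set_bit returns the previous value, so set_extent_buffer_dirty can report whether the buffer was already dirty without any extra locking. A C11 sketch of that primitive (stdatomic standing in for the kernel bitops; the names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define BUFFER_DIRTY (1u << 2)

static atomic_uint bflags;

/* mark dirty; return non-zero if it was already dirty */
static int set_buffer_dirty(void)
{
	unsigned int old = atomic_fetch_or(&bflags, BUFFER_DIRTY);
	return (old & BUFFER_DIRTY) != 0;
}

int main(void)
{
	printf("was_dirty=%d\n", set_buffer_dirty());	/* 0: first time */
	printf("was_dirty=%d\n", set_buffer_dirty());	/* 1: already set */
	return 0;
}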
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	file_key.offset = pos;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
 	if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	key.offset = end_byte - 1;
 	key.type = BTRFS_EXTENT_CSUM_KEY;
 
+	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
 	} else {
 		ins_size = csum_size;
 	}
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
+	path->leave_spinning = 0;
 	if (ret < 0)
 		goto fail_unlock;
 	if (ret != 0) {
@@ -776,7 +780,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	cond_resched();
 next_sector:
 
 	if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	cond_resched();
 	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
+		cond_resched();
 		goto again;
 	}
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
 	btrfs_release_path(root, path);
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &ins,
 				      sizeof(*extent));
 	BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
 					       ram_bytes);
 	btrfs_set_file_extent_type(leaf, extent, found_type);
 
+	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_set_lock_blocking(path->nodes[0]);
 
 	if (disk_bytenr != 0) {
 		ret = btrfs_update_extent_ref(trans, root,
-					      disk_bytenr, orig_parent,
+					      disk_bytenr,
+					      le64_to_cpu(old.disk_num_bytes),
+					      orig_parent,
 					      leaf->start,
 					      root->root_key.objectid,
 					      trans->transid, ins.objectid);
 
 		BUG_ON(ret);
 	}
+	path->leave_spinning = 0;
 	btrfs_release_path(root, path);
 	if (disk_bytenr != 0)
 		inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 
 	if (orig_parent != leaf->start) {
-		ret = btrfs_update_extent_ref(trans, root, bytenr,
+		ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
 					      orig_parent, leaf->start,
 					      root->root_key.objectid,
 					      trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 
+	/*
+	 * we want to make sure fsync finds this change
+	 * but we haven't joined a transaction running right now.
+	 *
+	 * Later on, someone is sure to update the inode and get the
+	 * real transid recorded.
+	 *
+	 * We set last_trans now to the fs_info generation + 1,
+	 * this will either be one more than the running transaction
+	 * or the generation used for the next transaction if there isn't
+	 * one running right now.
+	 */
+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
 	if (num_written > 0 && will_write) {
 		struct btrfs_trans_handle *trans;
 
@@ -1167,8 +1187,11 @@ out_nolock:
 			ret = btrfs_log_dentry_safe(trans, root,
 						    file->f_dentry);
 			if (ret == 0) {
-				btrfs_sync_log(trans, root);
-				btrfs_end_transaction(trans, root);
+				ret = btrfs_sync_log(trans, root);
+				if (ret == 0)
+					btrfs_end_transaction(trans, root);
+				else
+					btrfs_commit_transaction(trans, root);
 			} else {
 				btrfs_commit_transaction(trans, root);
 			}
@@ -1185,6 +1208,18 @@ out_nolock:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * ordered_data_close is set by setattr when we are about to truncate
+	 * a file from a non-zero size to a zero size.  This tries to
+	 * flush down new bytes that may have been written if the
+	 * application were using truncate to replace a file in place.
+	 */
+	if (BTRFS_I(inode)->ordered_data_close) {
+		BTRFS_I(inode)->ordered_data_close = 0;
+		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+			filemap_flush(inode->i_mapping);
+	}
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ret > 0) {
 			ret = btrfs_commit_transaction(trans, root);
 		} else {
-			btrfs_sync_log(trans, root);
-			ret = btrfs_end_transaction(trans, root);
+			ret = btrfs_sync_log(trans, root);
+			if (ret == 0)
+				ret = btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_commit_transaction(trans, root);
 		}
 	}
 	mutex_lock(&dentry->d_inode->i_mutex);
out:
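Both the write path above and btrfs_sync_file() now treat a failed btrfs_sync_log() as a signal to fall back to a full transaction commit rather than ending the transaction as if the log had reached disk. The control flow, reduced to a runnable sketch with stand-in functions (nothing here is btrfs API):

#include <stdio.h>

static int sync_log(void)           { return -1; /* pretend it failed */ }
static int end_transaction(void)    { puts("fast path: end transaction"); return 0; }
static int commit_transaction(void) { puts("fallback: full commit"); return 0; }

/* fsync tail: only trust the fast path when the log sync succeeded */
static int fsync_tail(void)
{
	int ret = sync_log();
	if (ret == 0)
		return end_transaction();
	return commit_transaction();
}

int main(void)
{
	return fsync_tail();
}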
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
+
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      ins_len);
 	if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17e608c4dc70..06d8db5afb08 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->leave_spinning = 1;
 	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
-			kaddr = kmap(cpage);
+			kaddr = kmap_atomic(cpage, KM_USER0);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
-			kunmap(cpage);
+			kunmap_atomic(kaddr, KM_USER0);
 
 			i++;
 			ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode, u64 start, u64 end,
 				 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	u64 cur_end;
 	int limit = 10 * 1024 * 1042;
 
-	if (!btrfs_test_opt(root, COMPRESS)) {
-		return cow_file_range(inode, locked_page, start, end,
-				      page_started, nr_written, 1);
-	}
-
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
 	while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+				       struct page *locked_page,
 			      u64 start, u64 end, int *page_started, int force,
 			      unsigned long *nr_written)
 {
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 			      unsigned long *nr_written)
 {
 	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
+	else if (!btrfs_test_opt(root, COMPRESS))
+		ret = cow_file_range(inode, locked_page, start, end,
+				     page_started, nr_written, 1);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
 					   page_started, nr_written);
-
 	return ret;
 }
 
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	path->leave_spinning = 1;
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
 				 file_pos + num_bytes, file_pos, &hint);
 	BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_compression(leaf, fi, compression);
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+	btrfs_unlock_up_safe(path, 1);
+	btrfs_set_lock_blocking(leaf);
+
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 					  root->root_key.objectid,
 					  trans->transid, inode->i_ino, &ins);
 	BUG_ON(ret);
-
 	btrfs_free_path(path);
+
 	return 0;
 }
 
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction.  It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+				struct btrfs_path *path,
+				struct btrfs_ordered_extent *ordered_extent)
+{
+	struct btrfs_ordered_sum *sum;
+	u64 bytenr;
+
+	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+			 list);
+	bytenr = sum->sums[0].bytenr;
+
+	/*
+	 * we don't care about the results, the point of this search is
+	 * just to get the btree leaves into ram
+	 */
+	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_ordered_extent *ordered_extent;
+	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
 	int compressed = 0;
 	int ret;
 
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (!ret)
 		return 0;
 
+	/*
+	 * before we join the transaction, try to do some of our IO.
+	 * This will limit the amount of IO that we have to do with
+	 * the transaction running.  We're unlikely to need to do any
+	 * IO if the file extents are new, the disk_i_size checks
+	 * covers the most common case.
+	 */
+	if (start < BTRFS_I(inode)->disk_i_size) {
+		path = btrfs_alloc_path();
+		if (path) {
+			ret = btrfs_lookup_file_extent(NULL, root, path,
+						       inode->i_ino,
+						       start, 0);
+			ordered_extent = btrfs_lookup_ordered_extent(inode,
+								     start);
+			if (!list_empty(&ordered_extent->list)) {
+				btrfs_release_path(root, path);
+				reada_csum(root, path, ordered_extent);
+			}
+			btrfs_free_path(path);
+		}
+	}
+
 	trans = btrfs_join_transaction(root, 1);
 
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	if (!ordered_extent)
+		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
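reada_csum() and the btrfs_lookup_file_extent() call above run before btrfs_join_transaction() purely to warm the cache: the same lookups will repeat inside the transaction, but against leaves that are already in ram. The shape of that optimization as a generic sketch (a toy cache stands in for the page cache; all names are illustrative):

#include <stdio.h>
#include <string.h>

/* toy page cache: remembers which keys were read before */
static char cache[4][32];
static int cached;

static int lookup(const char *key)
{
	for (int i = 0; i < cached; i++)
		if (strcmp(cache[i], key) == 0)
			return 1;		/* cache hit */
	snprintf(cache[cached++], sizeof(cache[0]), "%s", key);
	return 0;				/* miss: slow read */
}

int main(void)
{
	/* warm the cache before the "transaction"; results discarded */
	lookup("file extent leaf");
	lookup("csum leaf");

	/* inside the transaction the same lookups now hit */
	printf("file extent leaf: %s\n", lookup("file extent leaf") ? "hit" : "miss");
	printf("csum leaf: %s\n", lookup("csum leaf") ? "hit" : "miss");
	return 0;
}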
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	path->leave_spinning = 1;
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	}
 
+	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
 	BUG_ON(ret != 0 && ret != -ENOENT);
-	if (ret != -ENOENT)
-		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
+
+	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
 
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.type = (u8)-1;
 
 search_again:
+	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto error;
@@ -2644,6 +2702,7 @@ delete:
 			break;
 		}
 		if (found_extent) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
 						leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		if (attr->ia_size > inode->i_size) {
 		err = btrfs_cont_expand(inode, attr->ia_size);
 		if (err)
 			return err;
+		} else if (inode->i_size > 0 &&
+			   attr->ia_size == 0) {
+
+			/* we're truncating a file that used to have good
+			 * data down to zero.  Make sure it gets into
+			 * the ordered flush list so that any new writes
+			 * get down to disk quickly.
+			 */
+			BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2856 } 2925 }
2857 2926
2858 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
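The branch added above sets ordered_data_close when a file that had data is truncated to zero, and a later flush point consumes the flag. A minimal userspace model of that set-once/consume-once handshake; every name here is made up and nothing is btrfs API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct file_state {
	atomic_bool ordered_data_close;	/* truncated to zero, flush soon */
};

static void on_truncate_to_zero(struct file_state *f)
{
	/* analogue of BTRFS_I(inode)->ordered_data_close = 1 */
	atomic_store(&f->ordered_data_close, true);
}

static void on_next_flush_point(struct file_state *f)
{
	/* consume the flag exactly once, even with concurrent setters */
	if (atomic_exchange(&f->ordered_data_close, false))
		printf("queue file for ordered flush\n");
}

int main(void)
{
	struct file_state f = { .ordered_data_close = false };

	on_truncate_to_zero(&f);
	on_next_flush_point(&f);	/* flushes */
	on_next_flush_point(&f);	/* no-op */
	return 0;
}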
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3053 bi->disk_i_size = 0;
2985 bi->flags = 0; 3054 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3055 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3056 bi->last_unlink_trans = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3059 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3519 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3520 sizes[1] = name_len + sizeof(*ref);
3451 3521
3522 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3523 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3524 if (ret != 0)
3454 goto fail; 3525 goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3798 drop_inode = 1;
3728 3799
3729 nr = trans->blocks_used; 3800 nr = trans->blocks_used;
3801
3802 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3803 btrfs_end_transaction_throttle(trans, root);
3731fail: 3804fail:
3732 if (drop_inode) { 3805 if (drop_inode) {
@@ -4363,6 +4436,8 @@ again:
4363 } 4436 }
4364 ClearPageChecked(page); 4437 ClearPageChecked(page);
4365 set_page_dirty(page); 4438 set_page_dirty(page);
4439
4440 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4366 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4441 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4367 4442
4368out_unlock: 4443out_unlock:
@@ -4388,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
4388 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4463 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4389 4464
4390 trans = btrfs_start_transaction(root, 1); 4465 trans = btrfs_start_transaction(root, 1);
4466
4467 /*
4468 * setattr is responsible for setting the ordered_data_close flag,
4469 * but that is only tested during the last file release. That
4470 * could happen well after the next commit, leaving a great big
4471 * window where new writes may get lost if someone chooses to write
4472 * to this file after truncating to zero.
4473 *
4474 * The inode doesn't have any dirty data here, and so if we commit
4475 * this is a noop. If someone immediately starts writing to the inode
4476 * it is very likely we'll catch some of their writes in this
4477 * transaction, and the commit will find this file on the ordered
4478 * data list with good things to send down.
4479 *
4480 * This is a best effort solution, there is still a window where
4481 * using truncate to replace the contents of the file will
4482 * end up with a zero length file after a crash.
4483 */
4484 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4485 btrfs_add_ordered_operation(trans, root, inode);
4486
4391 btrfs_set_trans_block_group(trans, inode); 4487 btrfs_set_trans_block_group(trans, inode);
4392 btrfs_i_size_write(inode, inode->i_size); 4488 btrfs_i_size_write(inode, inode->i_size);
4393 4489
@@ -4464,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4464 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4560 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4465 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4561 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4466 INIT_LIST_HEAD(&ei->i_orphan); 4562 INIT_LIST_HEAD(&ei->i_orphan);
4563 INIT_LIST_HEAD(&ei->ordered_operations);
4467 return &ei->vfs_inode; 4564 return &ei->vfs_inode;
4468} 4565}
4469 4566
4470void btrfs_destroy_inode(struct inode *inode) 4567void btrfs_destroy_inode(struct inode *inode)
4471{ 4568{
4472 struct btrfs_ordered_extent *ordered; 4569 struct btrfs_ordered_extent *ordered;
4570 struct btrfs_root *root = BTRFS_I(inode)->root;
4571
4473 WARN_ON(!list_empty(&inode->i_dentry)); 4572 WARN_ON(!list_empty(&inode->i_dentry));
4474 WARN_ON(inode->i_data.nrpages); 4573 WARN_ON(inode->i_data.nrpages);
4475 4574
@@ -4480,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
4480 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4579 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4481 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4580 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4482 4581
4483 spin_lock(&BTRFS_I(inode)->root->list_lock); 4582 /*
4583 * Make sure we're properly removed from the ordered operation
4584 * lists.
4585 */
4586 smp_mb();
4587 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4588 spin_lock(&root->fs_info->ordered_extent_lock);
4589 list_del_init(&BTRFS_I(inode)->ordered_operations);
4590 spin_unlock(&root->fs_info->ordered_extent_lock);
4591 }
4592
4593 spin_lock(&root->list_lock);
4484 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4594 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4485 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4595 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4486 " list\n", inode->i_ino); 4596 " list\n", inode->i_ino);
4487 dump_stack(); 4597 dump_stack();
4488 } 4598 }
4489 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4599 spin_unlock(&root->list_lock);
4490 4600
4491 while (1) { 4601 while (1) {
4492 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4602 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
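The destroy-path hunk above does a cheap unlocked list_empty() check (paired with smp_mb() in the kernel) before taking the lock to unlink the inode from the ordered-operations list. A userspace sketch of the idea, using a toy intrusive list: it relies on list_del_init() leaving the node self-linked, so deleting an already-removed node is harmless, and it deliberately glosses over the memory-ordering details the kernel's barrier handles.

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next, *prev; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int list_empty(const struct node *n)
{
	return n->next == n;
}

static void list_del_init(struct node *n)
{
	n->next->prev = n->prev;
	n->prev->next = n->next;
	n->next = n->prev = n;	/* self-linked again, like the kernel helper */
}

static void remove_if_queued(struct node *n)
{
	/* cheap unlocked test first: most objects are not on the list */
	if (!list_empty(n)) {
		pthread_mutex_lock(&list_lock);
		/* no re-check needed: del on a self-linked node is a no-op */
		list_del_init(n);
		pthread_mutex_unlock(&list_lock);
	}
}

int main(void)
{
	struct node head = { &head, &head };
	struct node n = { &n, &n };	/* starts off-list */

	remove_if_queued(&n);		/* fast path, no lock taken */

	n.next = head.next; n.prev = &head;	/* queue it */
	head.next->prev = &n; head.next = &n;
	remove_if_queued(&n);
	printf("still queued: %s\n", list_empty(&n) ? "no" : "yes");
	return 0;
}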
@@ -4611,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4611 if (ret) 4721 if (ret)
4612 goto out_unlock; 4722 goto out_unlock;
4613 4723
4724 /*
4725 * we're using rename to replace one file with another,
4726 * and the replacement file is large. Start IO on it now so
4727 * we don't add too much work to the end of the transaction
4728 */
4729 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4730 new_inode->i_size &&
4731 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4732 filemap_flush(old_inode->i_mapping);
4733
4614 trans = btrfs_start_transaction(root, 1); 4734 trans = btrfs_start_transaction(root, 1);
4615 4735
4736 /*
4737 * make sure the inode gets flushed if it is replacing
4738 * something.
4739 */
4740 if (new_inode && new_inode->i_size &&
4741 old_inode && S_ISREG(old_inode->i_mode)) {
4742 btrfs_add_ordered_operation(trans, root, old_inode);
4743 }
4744
4745 /*
4746 * this is an ugly little race, but the rename is required to make
4747 * sure that if we crash, the inode is either at the old name
4748 * or the new one. pinning the log transaction lets us make sure
4749 * we don't allow a log commit to come in after we unlink the
4750 * name but before we add the new name back in.
4751 */
4752 btrfs_pin_log_trans(root);
4753
4616 btrfs_set_trans_block_group(trans, new_dir); 4754 btrfs_set_trans_block_group(trans, new_dir);
4617 4755
4618 btrfs_inc_nlink(old_dentry->d_inode); 4756 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4620,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4620 new_dir->i_ctime = new_dir->i_mtime = ctime; 4758 new_dir->i_ctime = new_dir->i_mtime = ctime;
4621 old_inode->i_ctime = ctime; 4759 old_inode->i_ctime = ctime;
4622 4760
4761 if (old_dentry->d_parent != new_dentry->d_parent)
4762 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4763
4623 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4764 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4624 old_dentry->d_name.name, 4765 old_dentry->d_name.name,
4625 old_dentry->d_name.len); 4766 old_dentry->d_name.len);
@@ -4651,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4651 if (ret) 4792 if (ret)
4652 goto out_fail; 4793 goto out_fail;
4653 4794
4795 btrfs_log_new_name(trans, old_inode, old_dir,
4796 new_dentry->d_parent);
4654out_fail: 4797out_fail:
4798
4799 /* this btrfs_end_log_trans just allows the current
4800 * log-sub transaction to complete
4801 */
4802 btrfs_end_log_trans(root);
4655 btrfs_end_transaction_throttle(trans, root); 4803 btrfs_end_transaction_throttle(trans, root);
4656out_unlock: 4804out_unlock:
4657 return ret; 4805 return ret;
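The pin/unpin pair bracketing this rename exists so a log commit cannot run between "unlink the old name" and "add the new name". A rough pthread analogue of that pin count, with invented names and none of the kernel's waitqueue machinery:

#include <pthread.h>

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unpinned = PTHREAD_COND_INITIALIZER;
static int pin_count;

static void pin_log(void)		/* btrfs_pin_log_trans() analogue */
{
	pthread_mutex_lock(&log_lock);
	pin_count++;
	pthread_mutex_unlock(&log_lock);
}

static void unpin_log(void)		/* btrfs_end_log_trans() analogue */
{
	pthread_mutex_lock(&log_lock);
	if (--pin_count == 0)
		pthread_cond_broadcast(&unpinned);
	pthread_mutex_unlock(&log_lock);
}

static void log_commit(void)
{
	pthread_mutex_lock(&log_lock);
	while (pin_count)	/* can't commit mid-rename */
		pthread_cond_wait(&unpinned, &log_lock);
	/* ... write out the log ... */
	pthread_mutex_unlock(&log_lock);
}

int main(void)
{
	pin_log();
	/* unlink old name; add new name -- commit cannot interleave */
	unpin_log();
	log_commit();
	return 0;
}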
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..a5310c0f41e2 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
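Taken together, the locking.c changes implement one policy: spin only while the lock holder looks about to leave (the BLOCKING bit is clear), otherwise go to sleep without ever touching the lock word. A userspace sketch of that adaptive spin-then-sleep acquisition; the kernel version also breaks out of the spin on need_resched(), which is omitted here.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static atomic_int blocking;	/* analogue of EXTENT_BUFFER_BLOCKING */
static pthread_mutex_t slow_lock = PTHREAD_MUTEX_INITIALIZER;

static int spin_on_block(void)
{
	for (int i = 0; i < 512; i++) {
		if (!atomic_load(&blocking))
			return 1;	/* holder looks short-term */
		sched_yield();		/* stand-in for cpu_relax() */
	}
	return 0;
}

static void adaptive_lock(void)
{
	/* try the fast path only when spinning looks worthwhile */
	if (spin_on_block() && pthread_mutex_trylock(&slow_lock) == 0)
		return;

	pthread_mutex_lock(&slow_lock);	/* slow path: actually sleep */
}

int main(void)
{
	adaptive_lock();
	pthread_mutex_unlock(&slow_lock);
	return 0;
}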
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered operations
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
387 * we have two modes here, one is to just start the IO via filemap_flush
388 * and the other is to wait for all the io. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
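btrfs_run_ordered_operations() above drains a shared list by detaching entries under the spinlock and doing the slow flush with the lock dropped, taking an igrab() reference so the inode cannot be freed meanwhile. The skeleton of that drain loop, reduced to a userspace sketch with a plain mutex and invented work items (the refcounting is only noted in a comment):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work { struct work *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *pending;

static void queue_work(int id)
{
	struct work *w = malloc(sizeof(*w));

	if (!w)
		return;
	w->id = id;
	pthread_mutex_lock(&list_lock);
	w->next = pending;
	pending = w;
	pthread_mutex_unlock(&list_lock);
}

static void run_ordered_work(void)
{
	pthread_mutex_lock(&list_lock);
	while (pending) {
		struct work *w = pending;

		pending = w->next;	/* detach one item under the lock */

		/*
		 * drop the lock for the slow part; this is where the
		 * kernel pins the inode with igrab() before flushing
		 */
		pthread_mutex_unlock(&list_lock);
		printf("flushing item %d\n", w->id);
		free(w);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	queue_work(1);
	queue_work(2);
	run_ordered_work();
	return 0;
}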
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
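The first check in btrfs_add_ordered_operation() makes the whole call a no-op when the inode hasn't changed since the last committed transaction, detected purely by comparing generation numbers. That fast path in isolation, as a small hypothetical model:

#include <stdio.h>

struct fs_info { unsigned long long last_trans_committed; };
struct inode_info { unsigned long long generation, last_trans; };

static int add_ordered_operation(struct fs_info *fs, struct inode_info *in)
{
	unsigned long long last_mod =
		in->generation > in->last_trans ? in->generation
						: in->last_trans;

	/* already fully on disk as of the last commit: nothing to do */
	if (last_mod < fs->last_trans_committed)
		return 0;

	printf("queue inode for pre-commit flush\n");
	return 0;
}

int main(void)
{
	struct fs_info fs = { .last_trans_committed = 10 };
	struct inode_info clean = { .generation = 5, .last_trans = 7 };
	struct inode_info dirty = { .generation = 5, .last_trans = 12 };

	add_ordered_operation(&fs, &clean);	/* no-op */
	add_ordered_operation(&fs, &dirty);	/* queued */
	return 0;
}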
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..664782c6a2df 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 67 cur_trans->start_time = get_seconds();
68
69 cur_trans->delayed_refs.root.rb_node = NULL;
70 cur_trans->delayed_refs.num_entries = 0;
71 cur_trans->delayed_refs.num_heads_ready = 0;
72 cur_trans->delayed_refs.num_heads = 0;
73 cur_trans->delayed_refs.flushing = 0;
74 cur_trans->delayed_refs.run_delayed_start = 0;
75 spin_lock_init(&cur_trans->delayed_refs.lock);
76
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 77 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 78 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 79 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 191 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 192 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0;
195
185 root->fs_info->running_transaction->use_count++; 196 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 197 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 198 return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 282 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 283 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 284 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 285 throttle_on_drops(root);
276} 286}
277 287
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 290{
281 struct btrfs_transaction *cur_trans; 291 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 292 struct btrfs_fs_info *info = root->fs_info;
293 int count = 0;
294
295 while (count < 4) {
296 unsigned long cur = trans->delayed_ref_updates;
297 trans->delayed_ref_updates = 0;
298 if (cur &&
299 trans->transaction->delayed_refs.num_heads_ready > 64) {
300 trans->delayed_ref_updates = 0;
301
302 /*
303 * do a full flush if the transaction is trying
304 * to close
305 */
306 if (trans->transaction->delayed_refs.flushing)
307 cur = 0;
308 btrfs_run_delayed_refs(trans, root, cur);
309 } else {
310 break;
311 }
312 count++;
313 }
283 314
284 mutex_lock(&info->trans_mutex); 315 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 316 cur_trans = info->running_transaction;
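The loop added to __btrfs_end_transaction() makes every transaction ender help drain the delayed-ref backlog, but caps the help at four rounds and skips it entirely when the backlog is small, so no single caller is stalled for long. A toy model of that bounded, opportunistic drain; the constants 4 and 64 mirror the patch, everything else is invented.

#include <stdio.h>

static unsigned long backlog = 300;	/* queued delayed updates */
static unsigned long my_updates = 100;	/* updates this caller generated */

/* pretend to process nr updates; processing can generate more work */
static void run_delayed(unsigned long nr)
{
	backlog = nr >= backlog ? 0 : backlog - nr;
	my_updates = nr / 4;	/* follow-on updates appear */
}

static void end_transaction(void)
{
	for (int count = 0; count < 4; count++) {
		unsigned long cur = my_updates;

		my_updates = 0;
		if (!cur || backlog <= 64)
			break;	/* backlog is small: don't bother */
		run_delayed(cur);
	}
}

int main(void)
{
	end_transaction();
	printf("backlog left: %lu\n", backlog);
	return 0;
}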
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 455 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 456 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 457
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 458 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 459
460 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
461 BUG_ON(ret);
430 462
431 while (1) { 463 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 464 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 470 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 471 btrfs_set_root_generation(&root->root_item, trans->transid);
440 472
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 473 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 474 &root->root_key,
445 &root->root_item); 475 &root->root_item);
446 BUG_ON(ret); 476 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 477 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 478
479 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
480 BUG_ON(ret);
449 } 481 }
450 return 0; 482 return 0;
451} 483}
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 491 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 492 struct list_head *next;
461 struct extent_buffer *eb; 493 struct extent_buffer *eb;
494 int ret;
462 495
463 btrfs_extent_post_op(trans, fs_info->tree_root); 496 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
497 BUG_ON(ret);
464 498
465 eb = btrfs_lock_root_node(fs_info->tree_root); 499 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 500 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 501 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 502 free_extent_buffer(eb);
469 503
470 btrfs_extent_post_op(trans, fs_info->tree_root); 504 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
505 BUG_ON(ret);
471 506
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 507 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 508 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 510 root = list_entry(next, struct btrfs_root, dirty_list);
476 511
477 update_cowonly_root(trans, root); 512 update_cowonly_root(trans, root);
513
514 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
515 BUG_ON(ret);
478 } 516 }
479 return 0; 517 return 0;
480} 518}
@@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 673}
636 674
637/* 675/*
676 * when dropping snapshots, we generate a ton of delayed refs, and it makes
677 * sense not to join the transaction while it is trying to flush the current
678 * queue of delayed refs out.
679 *
680 * This is used by the drop snapshot code only
681 */
682static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
683{
684 DEFINE_WAIT(wait);
685
686 mutex_lock(&info->trans_mutex);
687 while (info->running_transaction &&
688 info->running_transaction->delayed_refs.flushing) {
689 prepare_to_wait(&info->transaction_wait, &wait,
690 TASK_UNINTERRUPTIBLE);
691 mutex_unlock(&info->trans_mutex);
692 schedule();
693 mutex_lock(&info->trans_mutex);
694 finish_wait(&info->transaction_wait, &wait);
695 }
696 mutex_unlock(&info->trans_mutex);
697 return 0;
698}
699
700/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 701 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 702 * all of them
640 */ 703 */
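wait_transaction_pre_flush() is an open-coded condition wait: sleep while delayed_refs.flushing is set, re-checking under trans_mutex each time around. For readers less used to the prepare_to_wait()/schedule() idiom, the pthread equivalent is a plain condition variable; a sketch with invented names:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t transaction_wait = PTHREAD_COND_INITIALIZER;
static bool flushing;

static void wait_pre_flush(void)
{
	pthread_mutex_lock(&trans_mutex);
	while (flushing)	/* re-check the condition on every wakeup */
		pthread_cond_wait(&transaction_wait, &trans_mutex);
	pthread_mutex_unlock(&trans_mutex);
}

static void flush_done(void)
{
	pthread_mutex_lock(&trans_mutex);
	flushing = false;
	pthread_cond_broadcast(&transaction_wait);
	pthread_mutex_unlock(&trans_mutex);
}

int main(void)
{
	flush_done();		/* nothing is flushing ... */
	wait_pre_flush();	/* ... so this returns immediately */
	return 0;
}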
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 724 atomic_inc(&root->fs_info->throttles);
662 725
663 while (1) { 726 while (1) {
727 /*
728 * we don't want to jump in and create a bunch of
729 * delayed refs if the transaction is starting to close
730 */
731 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 732 trans = btrfs_start_transaction(tree_root, 1);
733
734 /*
735 * we've joined a transaction, make sure it isn't
736 * closing right now
737 */
738 if (trans->transaction->delayed_refs.flushing) {
739 btrfs_end_transaction(trans, tree_root);
740 continue;
741 }
742
665 mutex_lock(&root->fs_info->drop_mutex); 743 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 744 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 745 if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 844 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 845
768 old = btrfs_lock_root_node(root); 846 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 847 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 848
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 849 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 850 btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 972 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 973 DEFINE_WAIT(wait);
896 int ret; 974 int ret;
975 int should_grow = 0;
976 unsigned long now = get_seconds();
977
978 btrfs_run_ordered_operations(root, 0);
979
980 /* make a pass through all the delayed refs we have so far
981 * any running procs may add more while we are here
982 */
983 ret = btrfs_run_delayed_refs(trans, root, 0);
984 BUG_ON(ret);
985
986 cur_trans = trans->transaction;
987 /*
988 * set the flushing flag so procs in this transaction have to
989 * start sending their work down.
990 */
991 cur_trans->delayed_refs.flushing = 1;
992
993 ret = btrfs_run_delayed_refs(trans, root, 0);
994 BUG_ON(ret);
897 995
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 996 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 997 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 998 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 999 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 1000 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1001 btrfs_end_transaction(trans, root);
905 1002
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1019
923 trans->transaction->in_commit = 1; 1020 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1021 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1022 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1023 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1024 struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1033 }
938 } 1034 }
939 1035
1036 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1037 should_grow = 1;
1038
940 do { 1039 do {
941 int snap_pending = 0; 1040 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1041 joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1048
950 if (cur_trans->num_writers > 1) 1049 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1050 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1051 else if (should_grow)
953 timeout = 1; 1052 timeout = 1;
954 1053
955 mutex_unlock(&root->fs_info->trans_mutex); 1054 mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
959 BUG_ON(ret); 1058 BUG_ON(ret);
960 } 1059 }
961 1060
962 schedule_timeout(timeout); 1061 /*
1062 * renames don't use btrfs_join_transaction, so, once we
1063 * set the transaction to blocked above, we aren't going
1064 * to get any new ordered operations. We can safely run
1065 * it here and know for sure that nothing new will be added
1066 * to the list
1067 */
1068 btrfs_run_ordered_operations(root, 1);
1069
1070 smp_mb();
1071 if (cur_trans->num_writers > 1 || should_grow)
1072 schedule_timeout(timeout);
963 1073
964 mutex_lock(&root->fs_info->trans_mutex); 1074 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1075 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1076 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1077 (should_grow && cur_trans->num_joined != joined));
968 1078
969 ret = create_pending_snapshots(trans, root->fs_info); 1079 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1080 BUG_ON(ret);
971 1081
1082 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1083 BUG_ON(ret);
1084
972 WARN_ON(cur_trans != trans->transaction); 1085 WARN_ON(cur_trans != trans->transaction);
973 1086
974 /* btrfs_commit_tree_roots is responsible for getting the 1087 /* btrfs_commit_tree_roots is responsible for getting the
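Stripped of the waitqueue machinery, the commit loop above waits only while writers are still joining, and only bothers sleeping at all when should_grow says the transaction is young enough to be worth growing. The settle loop amounts to this sketch (names invented, timing crude):

#include <stdatomic.h>
#include <unistd.h>

static atomic_ulong num_joined;
static atomic_int num_writers;

static void wait_for_quiesce(int should_grow)
{
	unsigned long joined;

	do {
		joined = atomic_load(&num_joined);
		if (atomic_load(&num_writers) > 1 || should_grow)
			usleep(1000);	/* give late joiners a moment */
	} while (atomic_load(&num_writers) > 1 ||
		 (should_grow && atomic_load(&num_joined) != joined));
}

int main(void)
{
	atomic_store(&num_writers, 1);
	wait_for_quiesce(0);	/* lone old writer: returns at once */
	return 0;
}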
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1145 btrfs_copy_pinned(root, pinned_copy);
1033 1146
1034 trans->transaction->blocked = 0; 1147 trans->transaction->blocked = 0;
1148
1035 wake_up(&root->fs_info->transaction_throttle); 1149 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1150 wake_up(&root->fs_info->transaction_wait);
1037 1151
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1172 mutex_lock(&root->fs_info->trans_mutex);
1059 1173
1060 cur_trans->commit_done = 1; 1174 cur_trans->commit_done = 1;
1175
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1176 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1177 wake_up(&cur_trans->commit_wait);
1063 1178
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
27 * total writers in this transaction, it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..fc9b87a7975b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
62 * 2a is actually the more important variant. Without the extra logging
63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1 was fully removed from the FS, but fsync was never
74 * called on f1, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
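The "directory trouble cases" comment boils down to one recorded fact per directory: the transid of its last unlink or rename. Whether an fsync can be served from the log is then a single comparison, roughly like this hypothetical model:

#include <stdbool.h>
#include <stdio.h>

struct dir_info {
	/* recorded whenever a name is unlinked or renamed away */
	unsigned long long last_unlink_trans;
};

static bool needs_full_commit(const struct dir_info *dir,
			      unsigned long long last_committed)
{
	return dir->last_unlink_trans > last_committed;
}

int main(void)
{
	struct dir_info d = { .last_unlink_trans = 42 };

	printf("%s\n", needs_full_commit(&d, 40) ?
	       "full transaction commit" : "log-only fsync is safe");
	return 0;
}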
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
203 mutex_lock(&log->fs_info->pinned_mutex); 266 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 267 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 268 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 } 269 }
208 270
209 if (btrfs_buffer_uptodate(eb, gen)) { 271 if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 665
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 666 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 667 BUG_ON(ret);
668
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 669 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 670 BUG_ON(ret);
608 kfree(name); 671 kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
804 victim_name_len)) { 867 victim_name_len)) {
805 btrfs_inc_nlink(inode); 868 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 869 btrfs_release_path(root, path);
870
807 ret = btrfs_unlink_inode(trans, root, dir, 871 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 872 inode, victim_name,
809 victim_name_len); 873 victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 986 key.offset--;
923 btrfs_release_path(root, path); 987 btrfs_release_path(root, path);
924 } 988 }
925 btrfs_free_path(path); 989 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 990 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 991 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 992 btrfs_update_inode(trans, root, inode);
929 } 993 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 994 BTRFS_I(inode)->index_cnt = (u64)-1;
931 995
996 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
997 ret = replay_dir_deletes(trans, root, NULL, path,
998 inode->i_ino, 1);
999 BUG_ON(ret);
1000 }
1001 btrfs_free_path(path);
1002
932 return 0; 1003 return 0;
933} 1004}
934 1005
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1042
972 iput(inode); 1043 iput(inode);
973 1044
974 if (key.offset == 0) 1045 /*
975 break; 1046 * fixup on a directory may create new entries,
976 key.offset--; 1047 * make sure we always look for the highest possible
1048 * offset
1049 */
1050 key.offset = (u64)-1;
977 } 1051 }
978 btrfs_release_path(root, path); 1052 btrfs_release_path(root, path);
979 return 0; 1053 return 0;
@@ -1313,11 +1387,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1387 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1388 name_len);
1315 log_di = NULL; 1389 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1390 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1391 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1392 dir_key->objectid,
1319 name, name_len, 0); 1393 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1394 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1395 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1396 log_path,
1323 dir_key->objectid, 1397 dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1452 struct btrfs_root *root,
1379 struct btrfs_root *log, 1453 struct btrfs_root *log,
1380 struct btrfs_path *path, 1454 struct btrfs_path *path,
1381 u64 dirid) 1455 u64 dirid, int del_all)
1382{ 1456{
1383 u64 range_start; 1457 u64 range_start;
1384 u64 range_end; 1458 u64 range_end;
@@ -1408,10 +1482,14 @@ again:
1408 range_start = 0; 1482 range_start = 0;
1409 range_end = 0; 1483 range_end = 0;
1410 while (1) { 1484 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1485 if (del_all)
1412 &range_start, &range_end); 1486 range_end = (u64)-1;
1413 if (ret != 0) 1487 else {
1414 break; 1488 ret = find_dir_range(log, path, dirid, key_type,
1489 &range_start, &range_end);
1490 if (ret != 0)
1491 break;
1492 }
1415 1493
1416 dir_key.offset = range_start; 1494 dir_key.offset = range_start;
1417 while (1) { 1495 while (1) {
@@ -1437,7 +1515,8 @@ again:
1437 break; 1515 break;
1438 1516
1439 ret = check_item_in_log(trans, root, log, path, 1517 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1518 log_path, dir,
1519 &found_key);
1441 BUG_ON(ret); 1520 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1521 if (found_key.offset == (u64)-1)
1443 break; 1522 break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1593 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1594 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1595 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1596 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1597 BUG_ON(ret);
1519 } 1598 }
1520 ret = overwrite_item(wc->trans, root, path, 1599 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1612 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1613 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1614 BUG_ON(ret);
1615
1616 /* if the nlink count is zero here, the iput
1617 * will free the inode. We bump it to make
1618 * sure it doesn't get freed until the link
1619 * count fixup is done
1620 */
1621 if (inode->i_nlink == 0) {
1622 btrfs_inc_nlink(inode);
1623 btrfs_update_inode(wc->trans,
1624 root, inode);
1625 }
1536 iput(inode); 1626 iput(inode);
1537 } 1627 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1628 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1930 return ret;
1841} 1931}
1842 1932
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1933static int wait_log_commit(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root, unsigned long transid)
1844{ 1935{
1845 DEFINE_WAIT(wait); 1936 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1937 int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1945 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1946 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1947 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1948
1949 if (root->fs_info->last_trans_log_full_commit !=
1950 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1951 atomic_read(&root->log_commit[index]))
1859 schedule(); 1952 schedule();
1953
1860 finish_wait(&root->log_commit_wait[index], &wait); 1954 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1955 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1956 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1958 return 0;
1865} 1959}
1866 1960
1867static int wait_for_writer(struct btrfs_root *root) 1961static int wait_for_writer(struct btrfs_trans_handle *trans,
1962 struct btrfs_root *root)
1868{ 1963{
1869 DEFINE_WAIT(wait); 1964 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1965 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1966 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1967 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1968 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1969 if (root->fs_info->last_trans_log_full_commit !=
1970 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1971 schedule();
1876 mutex_lock(&root->log_mutex); 1972 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1973 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1978/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1979 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1980 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1981 * you know that any inodes previously logged are safely on disk only
1982 * if it returns 0.
1983 *
1984 * Any other return value means you need to call btrfs_commit_transaction.
1985 * Some of the edge cases for fsyncing directories that have had unlinks
1986 * or renames done in the past mean that sometimes the only safe
1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1988 * that has happened.
1886 */ 1989 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1990int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1991 struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1999 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 2000 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 2001 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 2002 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2003 mutex_unlock(&root->log_mutex);
1901 return 0; 2004 return 0;
1902 } 2005 }
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2007
1905 /* wait for previous tree log sync to complete */ 2008 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2010 wait_log_commit(trans, root, root->log_transid - 1);
1908 2011
1909 while (1) { 2012 while (1) {
1910 unsigned long batch = root->log_batch; 2013 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2014 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2015 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2016 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2017
2018 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2019 if (batch == root->log_batch)
1916 break; 2020 break;
1917 } 2021 }
1918 2022
2023 /* bail out if we need to do a full commit */
2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2025 ret = -EAGAIN;
2026 mutex_unlock(&root->log_mutex);
2027 goto out;
2028 }
2029
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2031 BUG_ON(ret);
1921 2032
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2062
1952 index2 = log_root_tree->log_transid % 2; 2063 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2064 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2065 wait_log_commit(trans, log_root_tree,
2066 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2067 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2068 goto out;
1957 } 2069 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2070 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2071
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2073 wait_log_commit(trans, log_root_tree,
2074 log_root_tree->log_transid - 1);
2075 }
2076
2077 wait_for_writer(trans, log_root_tree);
1962 2078
1963 wait_for_writer(log_root_tree); 2079 /*
2080 * now that we've moved on to the tree of log tree roots,
2081 * check the full commit flag again
2082 */
2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2084 mutex_unlock(&log_root_tree->log_mutex);
2085 ret = -EAGAIN;
2086 goto out_wake_log_root;
2087 }
1964 2088
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2090 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2109 * in and cause problems either.
1986 */ 2110 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2111 write_ctree_super(trans, root->fs_info->tree_root, 2);
2112 ret = 0;
1988 2113
2114out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2115 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2116 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
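With these hunks btrfs_sync_log() gains a real failure mode: -EAGAIN means the log alone cannot make this fsync safe and the caller has to commit the whole transaction. The resulting caller pattern, modeled with stand-in functions (the real fsync path lives in file.c, which is not part of this diff):

#include <stdio.h>

enum { SYNC_OK = 0, SYNC_EAGAIN = -11 };	/* -11 == -EAGAIN */

static int sync_log(void)
{
	return SYNC_EAGAIN;	/* unlink/rename history forces a commit */
}

static int commit_transaction(void)
{
	return SYNC_OK;
}

static int do_fsync(void)
{
	int ret = sync_log();

	if (ret)	/* log-only sync refused: pay for a full commit */
		ret = commit_transaction();
	return ret;
}

int main(void)
{
	printf("fsync -> %d\n", do_fsync());
	return 0;
}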
@@ -1998,7 +2124,8 @@ out:
1998 return 0; 2124 return 0;
1999} 2125}
2000 2126
2001/* * free all the extents used by the tree log. This should be called 2127/*
2128 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2129 * at commit time of the full transaction
2003 */ 2130 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2131int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2259
2133 btrfs_free_path(path); 2260 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2262 btrfs_end_log_trans(root);
2136 2263
2137 return 0; 2264 return 0;
2138} 2265}
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2287 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2289 btrfs_end_log_trans(root);
2163 2290
2164 return ret; 2291 return ret;
2165} 2292}
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2686 *
2560 * This handles both files and directories. 2687 * This handles both files and directories.
2561 */ 2688 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2689static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2690 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2691 int inode_only)
2565{ 2692{
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2712 min_key.offset = 0;
2586 2713
2587 max_key.objectid = inode->i_ino; 2714 max_key.objectid = inode->i_ino;
2715
2716 /* today the code can only do partial logging of directories */
2717 if (!S_ISDIR(inode->i_mode))
2718 inode_only = LOG_INODE_ALL;
2719
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2721 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2722 else
2591 max_key.type = (u8)-1; 2723 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2724 max_key.offset = (u64)-1;
2593 2725
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2726 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2727
2612 /* 2728 /*
@@ -2693,7 +2809,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2810 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2811 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2813 BUG_ON(ret);
2699 } 2814 }
@@ -2702,19 +2817,69 @@ next_slot:
2702 2817
2703 btrfs_free_path(path); 2818 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2819 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2820 return 0;
2707} 2821}
2708 2822
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2823/*
2710 struct btrfs_root *root, struct inode *inode, 2824 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2825 * of the directories in it require a full commit before they can
2826 * be logged. Returns zero if nothing special needs to be done or 1 if
2827 * a full commit is required.
2828 */
2829static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2830 struct inode *inode,
2831 struct dentry *parent,
2832 struct super_block *sb,
2833 u64 last_committed)
2712{ 2834{
2713 int ret; 2835 int ret = 0;
2836 struct btrfs_root *root;
2714 2837
2715 start_log_trans(trans, root); 2838 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2839 * for a regular file, if its inode is already on disk, we don't
2717 end_log_trans(root); 2840 * have to worry about the parents at all. This is because
2841 * we can use the last_unlink_trans field to record renames
2842 * and other fun in this file.
2843 */
2844 if (S_ISREG(inode->i_mode) &&
2845 BTRFS_I(inode)->generation <= last_committed &&
2846 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2847 goto out;
2848
2849 if (!S_ISDIR(inode->i_mode)) {
2850 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2851 goto out;
2852 inode = parent->d_inode;
2853 }
2854
2855 while (1) {
2856 BTRFS_I(inode)->logged_trans = trans->transid;
2857 smp_mb();
2858
2859 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2860 root = BTRFS_I(inode)->root;
2861
2862 /*
2863 * make sure any commits to the log are forced
2864 * to be full commits
2865 */
2866 root->fs_info->last_trans_log_full_commit =
2867 trans->transid;
2868 ret = 1;
2869 break;
2870 }
2871
2872 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2873 break;
2874
2875 if (parent == sb->s_root)
2876 break;
2877
2878 parent = parent->d_parent;
2879 inode = parent->d_inode;
2880
2881 }
2882out:
2718 return ret; 2883 return ret;
2719} 2884}
2720 2885
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2889 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2890 * the last committed transaction
2726 */ 2891 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2892int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2893 struct btrfs_root *root, struct inode *inode,
2894 struct dentry *parent, int exists_only)
2729{ 2895{
2730 int inode_only = LOG_INODE_ALL; 2896 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2897 struct super_block *sb;
2732 int ret; 2898 int ret = 0;
2899 u64 last_committed = root->fs_info->last_trans_committed;
2900
2901 sb = inode->i_sb;
2902
2903 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) {
2905 ret = 1;
2906 goto end_no_trans;
2907 }
2908
2909 ret = check_parent_dirs_for_sync(trans, inode, parent,
2910 sb, last_committed);
2911 if (ret)
2912 goto end_no_trans;
2733 2913
2734 start_log_trans(trans, root); 2914 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2915
2742 dentry = dentry->d_parent; 2916 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2917 BUG_ON(ret);
2918
2919 /*
2920 * for a regular file, if its inode is already on disk, we don't
2921 * have to worry about the parents at all. This is because
2922 * we can use the last_unlink_trans field to record renames
2923 * and other fun in this file.
2924 */
2925 if (S_ISREG(inode->i_mode) &&
2926 BTRFS_I(inode)->generation <= last_committed &&
2927 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2928 goto no_parent;
2929
2930 inode_only = LOG_INODE_EXISTS;
2931 while (1) {
2932 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2933 break;
2745 2934
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2935 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2936 if (BTRFS_I(inode)->generation >
2937 root->fs_info->last_trans_committed) {
2938 ret = btrfs_log_inode(trans, root, inode, inode_only);
2939 BUG_ON(ret);
2940 }
2941 if (parent == sb->s_root)
2748 break; 2942 break;
2943
2944 parent = parent->d_parent;
2749 } 2945 }
2750 end_log_trans(root); 2946no_parent:
2751 return 0; 2947 ret = 0;
2948 btrfs_end_log_trans(root);
2949end_no_trans:
2950 return ret;
2752} 2951}
2753 2952
2754/* 2953/*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2959int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2960 struct btrfs_root *root, struct dentry *dentry)
2762{ 2961{
2763 u64 gen; 2962 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2963 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2964}
2770 2965
2771/* 2966/*
@@ -2884,3 +3079,94 @@ again:
2884 kfree(log_root_tree); 3079 kfree(log_root_tree);
2885 return 0; 3080 return 0;
2886} 3081}
3082
3083/*
3084 * there are some corner cases where we want to force a full
3085 * commit instead of allowing a directory to be logged.
3086 *
3087 * They revolve around files that were unlinked from the directory, and
3088 * this function updates the parent directory so that a full commit is
3089 * properly done if it is fsync'd later after the unlinks are done.
3090 */
3091void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3092 struct inode *dir, struct inode *inode,
3093 int for_rename)
3094{
3095 /*
3096 * when we're logging a file, if it hasn't been renamed
3097 * or unlinked, and its inode is fully committed on disk,
3098 * we don't have to worry about walking up the directory chain
3099 * to log its parents.
3100 *
3101 * So, we use the last_unlink_trans field to put this transid
3102 * into the file. When the file is logged we check it and
3103 * don't log the parents if the file is fully on disk.
3104 */
3105 if (S_ISREG(inode->i_mode))
3106 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3107
3108 /*
3109 * if this directory was already logged any new
3110 * names for this file/dir will get recorded
3111 */
3112 smp_mb();
3113 if (BTRFS_I(dir)->logged_trans == trans->transid)
3114 return;
3115
3116 /*
3117 * if the inode we're about to unlink was logged,
3118 * the log will be properly updated for any new names
3119 */
3120 if (BTRFS_I(inode)->logged_trans == trans->transid)
3121 return;
3122
3123 /*
3124 * when renaming files across directories, if the directory
3125 * we're unlinking from gets fsync'd later on, there's
3126 * no way to find the destination directory later and fsync it
3127 * properly. So, we have to be conservative and force commits
3128 * so the new name gets discovered.
3129 */
3130 if (for_rename)
3131 goto record;
3132
3133 /* we can safely do the unlink without any special recording */
3134 return;
3135
3136record:
3137 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3138}
3139
3140/*
3141 * Call this after adding a new name for a file and it will properly
3142 * update the log to reflect the new name.
3143 *
3144 * It will return zero if all goes well, and it will return 1 if a
3145 * full transaction commit is required.
3146 */
3147int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3148 struct inode *inode, struct inode *old_dir,
3149 struct dentry *parent)
3150{
3151 struct btrfs_root *root = BTRFS_I(inode)->root;
3152
3153 /*
3154 * this will force the logging code to walk the dentry chain
3155 * up for the file
3156 */
3157 if (S_ISREG(inode->i_mode))
3158 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3159
3160 /*
3161 * if this inode hasn't been logged and the directory we're renaming it
3162 * from hasn't been logged, we don't need to log it
3163 */
3164 if (BTRFS_I(inode)->logged_trans <=
3165 root->fs_info->last_trans_committed &&
3166 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3167 root->fs_info->last_trans_committed))
3168 return 0;
3169
3170 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3171}
3172
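
The tree-log.c changes above hinge on two per-inode fields, logged_trans and last_unlink_trans: an fsync may stay in the log tree only while nothing in the dentry chain has seen an unlink or rename newer than the last committed transaction. The sketch below is a minimal userspace model of that decision; demo_inode and its fields are hypothetical stand-ins for struct btrfs_inode, and the walk is simplified to start at the inode itself.

    #include <stdio.h>

    struct demo_inode {
        unsigned long long generation;        /* transid that created it */
        unsigned long long last_unlink_trans; /* last unlink/rename seen */
        struct demo_inode *parent;            /* NULL at the fs root */
        int is_reg;                           /* regular file? */
    };

    /* 1 when a full transaction commit is required, 0 when the log suffices */
    static int needs_full_commit(struct demo_inode *inode,
                                 unsigned long long last_committed)
    {
        /* fully committed regular files never force a parent walk */
        if (inode->is_reg &&
            inode->generation <= last_committed &&
            inode->last_unlink_trans <= last_committed)
            return 0;

        for (; inode; inode = inode->parent)
            if (inode->last_unlink_trans > last_committed)
                return 1;
        return 0;
    }

    int main(void)
    {
        struct demo_inode root = { 1, 0, NULL, 0 };
        struct demo_inode dir  = { 5, 7, &root, 0 }; /* unlink in trans 7 */
        struct demo_inode file = { 7, 0, &dir, 1 };  /* created in trans 7 */

        /* last committed transaction is 6, so the unlink is still unflushed */
        printf("full commit needed: %d\n", needs_full_commit(&file, 6));
        return 0;
    }
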
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899c..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
55} 55}
56 56
57static int ext4_group_used_meta_blocks(struct super_block *sb, 57static int ext4_group_used_meta_blocks(struct super_block *sb,
58 ext4_group_t block_group) 58 ext4_group_t block_group,
59 struct ext4_group_desc *gdp)
59{ 60{
60 ext4_fsblk_t tmp; 61 ext4_fsblk_t tmp;
61 struct ext4_sb_info *sbi = EXT4_SB(sb); 62 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
63 int used_blocks = sbi->s_itb_per_group + 2; 64 int used_blocks = sbi->s_itb_per_group + 2;
64 65
65 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 66 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
66 struct ext4_group_desc *gdp;
67 struct buffer_head *bh;
68
69 gdp = ext4_get_group_desc(sb, block_group, &bh);
70 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), 67 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
71 block_group)) 68 block_group))
72 used_blocks--; 69 used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
177 */ 174 */
178 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); 175 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
179 } 176 }
180 return free_blocks - ext4_group_used_meta_blocks(sb, block_group); 177 return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
181} 178}
182 179
183 180
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
473 470
474 if (sbi->s_log_groups_per_flex) { 471 if (sbi->s_log_groups_per_flex) {
475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 472 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
476 spin_lock(sb_bgl_lock(sbi, flex_group)); 473 atomic_add(blocks_freed,
477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; 474 &sbi->s_flex_groups[flex_group].free_blocks);
478 spin_unlock(sb_bgl_lock(sbi, flex_group));
479 } 475 }
480 /* 476 /*
481 * request to reload the buddy with the 477 * request to reload the buddy with the
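
The balloc.c hunk above drops the sb_bgl_lock()-protected read-modify-write of free_blocks in favour of a single atomic add, which is why struct flex_groups switches to atomic_t fields in the ext4.h hunk below. A minimal C11 sketch of the same transformation, with hypothetical names:

    #include <stdatomic.h>
    #include <stdio.h>

    struct flex_counts {
        atomic_uint free_blocks; /* was: plain int guarded by a spinlock */
    };

    static void add_freed_blocks(struct flex_counts *fc, unsigned n)
    {
        /* one lock-free update, like atomic_add(blocks_freed, ...) */
        atomic_fetch_add_explicit(&fc->free_blocks, n, memory_order_relaxed);
    }

    int main(void)
    {
        struct flex_counts fc = { 0 };
        add_freed_blocks(&fc, 16);
        printf("free_blocks = %u\n", atomic_load(&fc.free_blocks));
        return 0;
    }
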
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
67 unsigned int offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len,
71 dir->i_sb->s_blocksize);
71 72
72 if (rlen < EXT4_DIR_REC_LEN(1)) 73 if (rlen < EXT4_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal"; 74 error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
178 * least that it is non-zero. A 179 * least that it is non-zero. A
179 * failure will be detected in the 180 * failure will be detected in the
180 * dirent test below. */ 181 * dirent test below. */
181 if (ext4_rec_len_from_disk(de->rec_len) 182 if (ext4_rec_len_from_disk(de->rec_len,
182 < EXT4_DIR_REC_LEN(1)) 183 sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
183 break; 184 break;
184 i += ext4_rec_len_from_disk(de->rec_len); 185 i += ext4_rec_len_from_disk(de->rec_len,
186 sb->s_blocksize);
185 } 187 }
186 offset = i; 188 offset = i;
187 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 189 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
203 ret = stored; 205 ret = stored;
204 goto out; 206 goto out;
205 } 207 }
206 offset += ext4_rec_len_from_disk(de->rec_len); 208 offset += ext4_rec_len_from_disk(de->rec_len,
209 sb->s_blocksize);
207 if (le32_to_cpu(de->inode)) { 210 if (le32_to_cpu(de->inode)) {
208 /* We might block in the next section 211 /* We might block in the next section
209 * if the data destination is 212 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
225 goto revalidate; 228 goto revalidate;
226 stored++; 229 stored++;
227 } 230 }
228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 231 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
232 sb->s_blocksize);
229 } 233 }
230 offset = 0; 234 offset = 0;
231 brelse(bh); 235 brelse(bh);
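
Every dir.c caller above now passes sb->s_blocksize because a 16-bit on-disk rec_len cannot literally hold 65536: the value EXT4_MAX_REC_LEN (65535) is kept as an escape meaning "one full block". The helpers' old bodies are the ones removed from ext4.h further down; the sketch below mirrors them with blocksize replacing the hardcoded 1 << 16, assuming a little-endian host so the le16 conversions drop out.

    #include <assert.h>
    #include <stdint.h>

    #define EXT4_MAX_REC_LEN ((1 << 16) - 1)

    static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
    {
        unsigned len = dlen;

        /* 0 or 65535 on disk stands for "the whole block" */
        if (len == EXT4_MAX_REC_LEN || len == 0)
            return blocksize;
        return len;
    }

    static uint16_t rec_len_to_disk(unsigned len, unsigned blocksize)
    {
        assert(len <= blocksize);
        if (len == (1 << 16))       /* a 64KB record won't fit in 16 bits */
            return EXT4_MAX_REC_LEN;
        return (uint16_t)len;
    }

    int main(void)
    {
        /* a directory entry spanning a whole 64KB block round-trips */
        unsigned bs = 65536;
        assert(rec_len_from_disk(rec_len_to_disk(bs, bs), bs) == bs);
        return 0;
    }
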
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 990c94000924..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
33#undef EXT4FS_DEBUG 33#undef EXT4FS_DEBUG
34 34
35/* 35/*
36 * Define EXT4_RESERVATION to reserve data blocks for expanding files
37 */
38#define EXT4_DEFAULT_RESERVE_BLOCKS 8
39/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
40#define EXT4_MAX_RESERVE_BLOCKS 1027
41#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
42
43/*
44 * Debug code 36 * Debug code
45 */ 37 */
46#ifdef EXT4FS_DEBUG 38#ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
54#define ext4_debug(f, a...) do {} while (0) 46#define ext4_debug(f, a...) do {} while (0)
55#endif 47#endif
56 48
57#define EXT4_MULTIBLOCK_ALLOCATOR 1
58
59/* prefer goal again. length */ 49/* prefer goal again. length */
60#define EXT4_MB_HINT_MERGE 1 50#define EXT4_MB_HINT_MERGE 1
61/* blocks already reserved */ 51/* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
180 */ 170 */
181 171
182struct flex_groups { 172struct flex_groups {
183 __u32 free_inodes; 173 atomic_t free_inodes;
184 __u32 free_blocks; 174 atomic_t free_blocks;
175 atomic_t used_dirs;
185}; 176};
186 177
187#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 178#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
249#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 240#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
250#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 241#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
251 242
243/* Flags that should be inherited by new inodes from their parent. */
244#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
245 EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
246 EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
247 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
248 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
249
250/* Flags that are appropriate for regular files (all but dir-specific ones). */
251#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
252
253/* Flags that are appropriate for non-directories/regular files. */
254#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
255
256/* Mask out flags that are inappropriate for the given type of inode. */
257static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
258{
259 if (S_ISDIR(mode))
260 return flags;
261 else if (S_ISREG(mode))
262 return flags & EXT4_REG_FLMASK;
263 else
264 return flags & EXT4_OTHER_FLMASK;
265}
266
252/* 267/*
253 * Inode dynamic state flags 268 * Inode dynamic state flags
254 */ 269 */
@@ -256,6 +271,7 @@ struct flex_groups {
256#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ 271#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
257#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 272#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
258#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 273#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
274#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
259 275
260/* Used to pass group descriptor data when online resize is done */ 276/* Used to pass group descriptor data when online resize is done */
261struct ext4_new_group_input { 277struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
303#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 319#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
304#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) 320#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
305#define EXT4_IOC_MIGRATE _IO('f', 9) 321#define EXT4_IOC_MIGRATE _IO('f', 9)
322 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
306 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 323 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
324#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
307 325
308/* 326/*
309 * ioctl commands in 32 bit emulation 327 * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
531#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ 549#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
532#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ 550#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
533#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 551#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
534#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ 552#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
535#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 553#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
536#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ 554#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
537#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 555#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
666 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 684 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
667 __u8 s_reserved_char_pad2; 685 __u8 s_reserved_char_pad2;
668 __le16 s_reserved_pad; 686 __le16 s_reserved_pad;
669 __u32 s_reserved[162]; /* Padding to the end of the block */ 687 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
688 __u32 s_reserved[160]; /* Padding to the end of the block */
670}; 689};
671 690
672#ifdef __KERNEL__ 691#ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
814#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ 833#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
815 834
816/* 835/*
836 * Minimum number of groups in a flexgroup before we separate out
837 * directories into the first block group of a flexgroup
838 */
839#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
840
841/*
817 * Structure of a directory entry 842 * Structure of a directory entry
818 */ 843 */
819#define EXT4_NAME_LEN 255 844#define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
865 ~EXT4_DIR_ROUND) 890 ~EXT4_DIR_ROUND)
866#define EXT4_MAX_REC_LEN ((1<<16)-1) 891#define EXT4_MAX_REC_LEN ((1<<16)-1)
867 892
868static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
869{
870 unsigned len = le16_to_cpu(dlen);
871
872 if (len == EXT4_MAX_REC_LEN || len == 0)
873 return 1 << 16;
874 return len;
875}
876
877static inline __le16 ext4_rec_len_to_disk(unsigned len)
878{
879 if (len == (1 << 16))
880 return cpu_to_le16(EXT4_MAX_REC_LEN);
881 else if (len > (1 << 16))
882 BUG();
883 return cpu_to_le16(len);
884}
885
886/* 893/*
887 * Hash Tree Directory indexing 894 * Hash Tree Directory indexing
888 * (c) Daniel Phillips, 2001 895 * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
970 977
971extern struct proc_dir_entry *ext4_proc_root; 978extern struct proc_dir_entry *ext4_proc_root;
972 979
973#ifdef CONFIG_PROC_FS
974extern const struct file_operations ext4_ui_proc_fops;
975
976#define EXT4_PROC_HANDLER(name, var) \
977do { \
978 proc = proc_create_data(name, mode, sbi->s_proc, \
979 &ext4_ui_proc_fops, &sbi->s_##var); \
980 if (proc == NULL) { \
981 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
982 goto err_out; \
983 } \
984} while (0)
985#else
986#define EXT4_PROC_HANDLER(name, var)
987#endif
988
989/* 980/*
990 * Function prototypes 981 * Function prototypes
991 */ 982 */
@@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode);
1092extern void ext4_truncate(struct inode *); 1083extern void ext4_truncate(struct inode *);
1093extern void ext4_set_inode_flags(struct inode *); 1084extern void ext4_set_inode_flags(struct inode *);
1094extern void ext4_get_inode_flags(struct ext4_inode_info *); 1085extern void ext4_get_inode_flags(struct ext4_inode_info *);
1086extern int ext4_alloc_da_blocks(struct inode *inode);
1095extern void ext4_set_aops(struct inode *inode); 1087extern void ext4_set_aops(struct inode *inode);
1096extern int ext4_writepage_trans_blocks(struct inode *); 1088extern int ext4_writepage_trans_blocks(struct inode *);
1097extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); 1089extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1107 1099
1108/* migrate.c */ 1100/* migrate.c */
1109extern int ext4_ext_migrate(struct inode *); 1101extern int ext4_ext_migrate(struct inode *);
1102
1110/* namei.c */ 1103/* namei.c */
1104extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1105extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1111extern int ext4_orphan_add(handle_t *, struct inode *); 1106extern int ext4_orphan_add(handle_t *, struct inode *);
1112extern int ext4_orphan_del(handle_t *, struct inode *); 1107extern int ext4_orphan_del(handle_t *, struct inode *);
1113extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1108extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
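
The new ext4_mask_flags() shown above centralizes the per-mode filtering that ext4_new_inode() previously open-coded (compare the ialloc.c hunk below). A small userspace check of the three cases, reusing the two masks verbatim; the individual flag values are the standard ext4 bit assignments and are restated here as an assumption:

    #include <stdio.h>
    #include <sys/stat.h>

    /* standard ext4 flag bits (assumed; normally from ext4.h) */
    #define EXT4_NODUMP_FL   0x00000040
    #define EXT4_NOATIME_FL  0x00000080
    #define EXT4_DIRSYNC_FL  0x00010000
    #define EXT4_TOPDIR_FL   0x00020000

    #define EXT4_REG_FLMASK   (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
    #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

    static unsigned ext4_mask_flags(mode_t mode, unsigned flags)
    {
        if (S_ISDIR(mode))
            return flags;            /* directories inherit everything */
        else if (S_ISREG(mode))
            return flags & EXT4_REG_FLMASK;
        else
            return flags & EXT4_OTHER_FLMASK;
    }

    int main(void)
    {
        unsigned parent = EXT4_DIRSYNC_FL | EXT4_NOATIME_FL;

        printf("dir: %#x\n", ext4_mask_flags(S_IFDIR, parent)); /* keeps both */
        printf("reg: %#x\n", ext4_mask_flags(S_IFREG, parent)); /* drops dirsync */
        printf("chr: %#x\n", ext4_mask_flags(S_IFCHR, parent)); /* noatime only */
        return 0;
    }
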
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, 241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
242 ext4_lblk_t *, ext4_fsblk_t *); 242 ext4_lblk_t *, ext4_fsblk_t *);
243extern void ext4_ext_drop_refs(struct ext4_ext_path *); 243extern void ext4_ext_drop_refs(struct ext4_ext_path *);
244extern int ext4_ext_check_inode(struct inode *inode);
244#endif /* _EXT4_EXTENTS */ 245#endif /* _EXT4_EXTENTS */
245 246
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned int ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end
38
39/* 36/*
40 * storage for cached extent 37 * storage for cached extent
41 */ 38 */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
125 struct list_head i_prealloc_list; 122 struct list_head i_prealloc_list;
126 spinlock_t i_prealloc_lock; 123 spinlock_t i_prealloc_lock;
127 124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
62 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter; 64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock s_blockgroup_lock; 65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc; 66 struct proc_dir_entry *s_proc;
67 67 struct kobject s_kobj;
68 /* root of the per fs reservation window tree */ 68 struct completion s_kobj_unregister;
69 spinlock_t s_rsv_window_lock;
70 struct rb_root s_rsv_window_root;
71 69
72 /* Journaling */ 70 /* Journaling */
73 struct inode *s_journal_inode; 71 struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
146 /* locality groups */ 144 /* locality groups */
147 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
148 146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
149 unsigned int s_log_groups_per_flex; 151 unsigned int s_log_groups_per_flex;
150 struct flex_groups *s_flex_groups; 152 struct flex_groups *s_flex_groups;
151}; 153};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
153static inline spinlock_t * 155static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) 156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{ 157{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); 158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
157} 159}
158 160
159#endif /* _EXT4_SB */ 161#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f596..ac77d8b8251d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
152 ext4_fsblk_t bg_start; 152 ext4_fsblk_t bg_start;
153 ext4_fsblk_t last_block; 153 ext4_fsblk_t last_block;
154 ext4_grpblk_t colour; 154 ext4_grpblk_t colour;
155 ext4_group_t block_group;
156 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
155 int depth; 157 int depth;
156 158
157 if (path) { 159 if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
170 } 172 }
171 173
172 /* OK. use inode's group */ 174 /* OK. use inode's group */
173 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 175 block_group = ei->i_block_group;
176 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
177 /*
178 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
179 * block groups per flexgroup, reserve the first block
180 * group for directories and special files. Regular
181 * files will start at the second block group. This
182 * tends to speed up directory access and improves
183 * fsck times.
184 */
185 block_group &= ~(flex_size-1);
186 if (S_ISREG(inode->i_mode))
187 block_group++;
188 }
189 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
174 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); 190 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
175 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 191 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
176 192
193 /*
194 * If we are doing delayed allocation, we don't need take
195 * colour into account.
196 */
197 if (test_opt(inode->i_sb, DELALLOC))
198 return bg_start;
199
177 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 200 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
178 colour = (current->pid % 16) * 201 colour = (current->pid % 16) *
179 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
301 return max; 324 return max;
302} 325}
303 326
304static int __ext4_ext_check_header(const char *function, struct inode *inode, 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{
329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
333 ((block + len) > ext4_blocks_count(es))))
334 return 0;
335 else
336 return 1;
337}
338
339static int ext4_valid_extent_idx(struct inode *inode,
340 struct ext4_extent_idx *ext_idx)
341{
342 ext4_fsblk_t block = idx_pblock(ext_idx);
343 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
344 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
345 (block > ext4_blocks_count(es))))
346 return 0;
347 else
348 return 1;
349}
350
351static int ext4_valid_extent_entries(struct inode *inode,
352 struct ext4_extent_header *eh,
353 int depth)
354{
355 struct ext4_extent *ext;
356 struct ext4_extent_idx *ext_idx;
357 unsigned short entries;
358 if (eh->eh_entries == 0)
359 return 1;
360
361 entries = le16_to_cpu(eh->eh_entries);
362
363 if (depth == 0) {
364 /* leaf entries */
365 ext = EXT_FIRST_EXTENT(eh);
366 while (entries) {
367 if (!ext4_valid_extent(inode, ext))
368 return 0;
369 ext++;
370 entries--;
371 }
372 } else {
373 ext_idx = EXT_FIRST_INDEX(eh);
374 while (entries) {
375 if (!ext4_valid_extent_idx(inode, ext_idx))
376 return 0;
377 ext_idx++;
378 entries--;
379 }
380 }
381 return 1;
382}
383
384static int __ext4_ext_check(const char *function, struct inode *inode,
305 struct ext4_extent_header *eh, 385 struct ext4_extent_header *eh,
306 int depth) 386 int depth)
307{ 387{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
329 error_msg = "invalid eh_entries"; 409 error_msg = "invalid eh_entries";
330 goto corrupted; 410 goto corrupted;
331 } 411 }
412 if (!ext4_valid_extent_entries(inode, eh, depth)) {
413 error_msg = "invalid extent entries";
414 goto corrupted;
415 }
332 return 0; 416 return 0;
333 417
334corrupted: 418corrupted:
335 ext4_error(inode->i_sb, function, 419 ext4_error(inode->i_sb, function,
336 "bad header in inode #%lu: %s - magic %x, " 420 "bad header/extent in inode #%lu: %s - magic %x, "
337 "entries %u, max %u(%u), depth %u(%u)", 421 "entries %u, max %u(%u), depth %u(%u)",
338 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 422 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
339 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 423 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
342 return -EIO; 426 return -EIO;
343} 427}
344 428
345#define ext4_ext_check_header(inode, eh, depth) \ 429#define ext4_ext_check(inode, eh, depth) \
346 __ext4_ext_check_header(__func__, inode, eh, depth) 430 __ext4_ext_check(__func__, inode, eh, depth)
431
432int ext4_ext_check_inode(struct inode *inode)
433{
434 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
435}
347 436
348#ifdef EXT_DEBUG 437#ifdef EXT_DEBUG
349static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 438static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
547 636
548 eh = ext_inode_hdr(inode); 637 eh = ext_inode_hdr(inode);
549 depth = ext_depth(inode); 638 depth = ext_depth(inode);
550 if (ext4_ext_check_header(inode, eh, depth))
551 return ERR_PTR(-EIO);
552
553 639
554 /* account possible depth increase */ 640 /* account possible depth increase */
555 if (!path) { 641 if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
565 i = depth; 651 i = depth;
566 /* walk through the tree */ 652 /* walk through the tree */
567 while (i) { 653 while (i) {
654 int need_to_validate = 0;
655
568 ext_debug("depth %d: num %d, max %d\n", 656 ext_debug("depth %d: num %d, max %d\n",
569 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 657 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
570 658
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
573 path[ppos].p_depth = i; 661 path[ppos].p_depth = i;
574 path[ppos].p_ext = NULL; 662 path[ppos].p_ext = NULL;
575 663
576 bh = sb_bread(inode->i_sb, path[ppos].p_block); 664 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
577 if (!bh) 665 if (unlikely(!bh))
578 goto err; 666 goto err;
579 667 if (!bh_uptodate_or_lock(bh)) {
668 if (bh_submit_read(bh) < 0) {
669 put_bh(bh);
670 goto err;
671 }
672 /* validate the extent entries */
673 need_to_validate = 1;
674 }
580 eh = ext_block_hdr(bh); 675 eh = ext_block_hdr(bh);
581 ppos++; 676 ppos++;
582 BUG_ON(ppos > depth); 677 BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
584 path[ppos].p_hdr = eh; 679 path[ppos].p_hdr = eh;
585 i--; 680 i--;
586 681
587 if (ext4_ext_check_header(inode, eh, i)) 682 if (need_to_validate && ext4_ext_check(inode, eh, i))
588 goto err; 683 goto err;
589 } 684 }
590 685
@@ -1181,7 +1276,7 @@ got_index:
1181 return -EIO; 1276 return -EIO;
1182 eh = ext_block_hdr(bh); 1277 eh = ext_block_hdr(bh);
1183 /* subtract from p_depth to get proper eh_depth */ 1278 /* subtract from p_depth to get proper eh_depth */
1184 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1279 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1185 put_bh(bh); 1280 put_bh(bh);
1186 return -EIO; 1281 return -EIO;
1187 } 1282 }
@@ -1194,7 +1289,7 @@ got_index:
1194 if (bh == NULL) 1289 if (bh == NULL)
1195 return -EIO; 1290 return -EIO;
1196 eh = ext_block_hdr(bh); 1291 eh = ext_block_hdr(bh);
1197 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1292 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1198 put_bh(bh); 1293 put_bh(bh);
1199 return -EIO; 1294 return -EIO;
1200 } 1295 }
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2137 return -ENOMEM; 2232 return -ENOMEM;
2138 } 2233 }
2139 path[0].p_hdr = ext_inode_hdr(inode); 2234 path[0].p_hdr = ext_inode_hdr(inode);
2140 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { 2235 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2141 err = -EIO; 2236 err = -EIO;
2142 goto out; 2237 goto out;
2143 } 2238 }
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2191 err = -EIO; 2286 err = -EIO;
2192 break; 2287 break;
2193 } 2288 }
2194 if (ext4_ext_check_header(inode, ext_block_hdr(bh), 2289 if (ext4_ext_check(inode, ext_block_hdr(bh),
2195 depth - i - 1)) { 2290 depth - i - 1)) {
2196 err = -EIO; 2291 err = -EIO;
2197 break; 2292 break;
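
The extents.c hunks above add a semantic check on top of the old header check: every on-disk extent and index must point inside [s_first_data_block, blocks_count). A userspace caricature of ext4_valid_extent_entries() for the leaf case, with illustrative types and bounds:

    #include <stdio.h>

    struct demo_extent {
        unsigned long long start; /* first physical block */
        unsigned len;             /* number of blocks */
    };

    /* 1 if every extent lies inside [first_data_block, blocks_count) */
    static int valid_extent_entries(const struct demo_extent *ext, int entries,
                                    unsigned long long first_data_block,
                                    unsigned long long blocks_count)
    {
        for (int i = 0; i < entries; i++) {
            if (ext[i].start < first_data_block ||
                ext[i].start + ext[i].len > blocks_count)
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        struct demo_extent ok  = { 100, 8 };
        struct demo_extent bad = { 999, 8 }; /* runs past end of the fs */

        printf("%d %d\n",
               valid_extent_entries(&ok, 1, 1, 1000),
               valid_extent_entries(&bad, 1, 1, 1000));
        return 0;
    }
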
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
33 */ 33 */
34static int ext4_release_file(struct inode *inode, struct file *filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
37 ext4_alloc_da_blocks(inode);
38 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1) &&
43 !EXT4_I(inode)->i_reserved_data_blocks)
39 { 44 {
40 down_write(&EXT4_I(inode)->i_data_sem); 45 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_preallocations(inode); 46 ext4_discard_preallocations(inode);
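
ext4_release_file() above flushes delayed allocations on close when EXT4_STATE_DA_ALLOC_CLOSE is set; the same flush can be requested explicitly through the EXT4_IOC_ALLOC_DA_BLKS ioctl declared in the ext4.h hunk earlier (_IO('f', 12)). A minimal caller, assuming only that the ioctl takes no argument:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)

    int main(int argc, char **argv)
    {
        if (argc != 2) {
            fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
            return 1;
        }
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* ask ext4 to map this file's delalloc blocks right now */
        if (ioctl(fd, EXT4_IOC_ALLOC_DA_BLKS) < 0)
            perror("EXT4_IOC_ALLOC_DA_BLKS");
        close(fd);
        return 0;
    }
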
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8f..47b84e8df568 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count, cleared; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 if (is_directory) { 267 if (is_directory) {
269 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
270 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
275 }
276
271 } 277 }
272 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
273 block_group, gdp); 279 block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
278 284
279 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
280 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
281 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
282 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
283 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 } 290 }
285 } 291 }
286 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
360 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
361 367
362find_close_to_parent: 368find_close_to_parent:
363 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
364 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
365 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
366 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
367 goto found_flexbg; 373 goto found_flexbg;
368 374
@@ -375,24 +381,24 @@ find_close_to_parent:
375 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
376 continue; 382 continue;
377 383
378 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
379 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
380 386
381 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
382 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
383 best_flex = i; 389 best_flex = i;
384 goto found_flexbg; 390 goto found_flexbg;
385 } 391 }
386 392
387 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
388 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
389 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
390 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
391 best_flex = i; 397 best_flex = i;
392 } 398 }
393 399
394 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
395 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
396 return -1; 402 return -1;
397 403
398found_flexbg: 404found_flexbg:
@@ -410,6 +416,42 @@ out:
410 return 0; 416 return 0;
411} 417}
412 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is a flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
413/* 455/*
414 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
415 * 457 *
@@ -425,35 +467,34 @@ out:
425 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 474 */
437 475
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 477 ext4_group_t *group, int mode)
443{ 478{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 485 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 488 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
457 498
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 505
465 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 510 int ret = -1;
470 511
471 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
477 continue; 518 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
479 continue; 520 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
481 continue; 522 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
483 continue; 524 continue;
484 *group = grp; 525 grp = g;
485 ret = 0; 526 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start at 2nd block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
487 } 553 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 554 goto fallback;
491 } 555 }
492 556
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
499 560 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
502 if (max_debt * INODE_COST > inodes_per_group) 563 /*
503 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 565 * inode for this parent directory
505 max_debt = 255; 566 */
506 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
508 572
509 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
513 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
515 continue; 577 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 578 if (stats.free_inodes < min_inodes)
517 continue; 579 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 580 if (stats.free_blocks < min_blocks)
519 continue; 581 continue;
520 return 0; 582 goto found_flex_bg;
521 } 583 }
522 584
523fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 589 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 590 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 591 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 592 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 593 ext4_free_inodes_count(sb, desc) >= avefreei) {
594 *group = grp;
529 return 0; 595 return 0;
596 }
530 } 597 }
531 598
532 if (avefreei) { 599 if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
542} 609}
543 610
544static int find_group_other(struct super_block *sb, struct inode *parent, 611static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 612 ext4_group_t *group, int mode)
546{ 613{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 614 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 615 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 616 struct ext4_group_desc *desc;
550 ext4_group_t i; 617 ext4_group_t i, last;
618 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
619
620 /*
621 * Try to place the inode is the same flex group as its
622 * parent. If we can't find space, use the Orlov algorithm to
623 * find another flex group, and store that information in the
624 * parent directory's inode information so that use that flex
625 * group for future allocations.
626 */
627 if (flex_size > 1) {
628 int retry = 0;
629
630 try_again:
631 parent_group &= ~(flex_size-1);
632 last = parent_group + flex_size;
633 if (last > ngroups)
634 last = ngroups;
635 for (i = parent_group; i < last; i++) {
636 desc = ext4_get_group_desc(sb, i, NULL);
637 if (desc && ext4_free_inodes_count(sb, desc)) {
638 *group = i;
639 return 0;
640 }
641 }
642 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
643 retry = 1;
644 parent_group = EXT4_I(parent)->i_last_alloc_group;
645 goto try_again;
646 }
647 /*
648 * If this didn't work, use the Orlov search algorithm
649 * to find a new flex group; we pass in the mode to
650 * avoid the topdir algorithms.
651 */
652 *group = parent_group + flex_size;
653 if (*group > ngroups)
654 *group = 0;
655 return find_group_orlov(sb, parent, group, mode);
656 }
551 657
552 /* 658 /*
553 * Try to place the inode in its parent directory 659 * Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
665 if (S_ISDIR(mode)) { 771 if (S_ISDIR(mode)) {
666 count = ext4_used_dirs_count(sb, gdp) + 1; 772 count = ext4_used_dirs_count(sb, gdp) + 1;
667 ext4_used_dirs_set(sb, gdp, count); 773 ext4_used_dirs_set(sb, gdp, count);
774 if (sbi->s_log_groups_per_flex) {
775 ext4_group_t f = ext4_flex_group(sbi, group);
776
777 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
778 }
668 } 779 }
669 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 780 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
670err_ret: 781err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 827 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 828 es = sbi->s_es;
718 829
719 if (sbi->s_log_groups_per_flex) { 830 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 831 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 832 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 833 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 834 if (ret2 == 0 && once)
724 once = 0; 835 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 836 printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 844 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 845 ret2 = find_group_dir(sb, dir, &group);
735 else 846 else
736 ret2 = find_group_orlov(sb, dir, &group); 847 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 848 } else
738 ret2 = find_group_other(sb, dir, &group); 849 ret2 = find_group_other(sb, dir, &group, mode);
739 850
740got_group: 851got_group:
852 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 853 err = -ENOSPC;
742 if (ret2 == -1) 854 if (ret2 == -1)
743 goto out; 855 goto out;
@@ -858,9 +970,7 @@ got:
858 970
859 if (sbi->s_log_groups_per_flex) { 971 if (sbi->s_log_groups_per_flex) {
860 flex_group = ext4_flex_group(sbi, group); 972 flex_group = ext4_flex_group(sbi, group);
861 spin_lock(sb_bgl_lock(sbi, flex_group)); 973 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
862 sbi->s_flex_groups[flex_group].free_inodes--;
863 spin_unlock(sb_bgl_lock(sbi, flex_group));
864 } 974 }
865 975
866 inode->i_uid = current_fsuid(); 976 inode->i_uid = current_fsuid();
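This hunk is one instance of a pattern repeated through the patch (the free_blocks updates in the mballoc.c hunks below are the same change): a counter guarded by sb_bgl_lock() becomes an atomic_t, so the lock round-trip disappears. A userspace analogue using C11 atomics in place of the kernel's atomic_t; the struct name and values are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

struct flex_groups_model {
	atomic_int free_inodes;	/* was: plain int guarded by sb_bgl_lock() */
	atomic_int free_blocks;
};

int main(void)
{
	struct flex_groups_model fg;

	atomic_init(&fg.free_inodes, 100);
	atomic_init(&fg.free_blocks, 1000);
	/* was: spin_lock(); ...free_inodes--; spin_unlock(); */
	atomic_fetch_sub(&fg.free_inodes, 1);
	/* was the same lock/unlock dance for free_blocks += count */
	atomic_fetch_add(&fg.free_blocks, 8);
	printf("free_inodes=%d free_blocks=%d\n",
	       atomic_load(&fg.free_inodes), atomic_load(&fg.free_blocks));
	return 0;
}

The conversion works because each counter only ever takes whole increments and decrements, with no invariant tying it to other fields under the same lock.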
@@ -885,19 +995,16 @@ got:
885 ei->i_disksize = 0; 995 ei->i_disksize = 0;
886 996
887 /* 997 /*
888 * Don't inherit extent flag from directory. We set extent flag on 998 * Don't inherit extent flag from directory, amongst others. We set
889 * newly created directory and file only if -o extent mount option is 999 * extent flag on newly created directory and file only if -o extent
890 * specified 1000 * mount option is specified
891 */ 1001 */
892 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1002 ei->i_flags =
893 if (S_ISLNK(mode)) 1003 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
894 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
895 /* dirsync only applies to directories */
896 if (!S_ISDIR(mode))
897 ei->i_flags &= ~EXT4_DIRSYNC_FL;
898 ei->i_file_acl = 0; 1004 ei->i_file_acl = 0;
899 ei->i_dtime = 0; 1005 ei->i_dtime = 0;
900 ei->i_block_group = group; 1006 ei->i_block_group = group;
1007 ei->i_last_alloc_group = ~0;
901 1008
902 ext4_set_inode_flags(inode); 1009 ext4_set_inode_flags(inode);
903 if (IS_DIRSYNC(inode)) 1010 if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dd82ff390067..a2e7952bc5f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
371 return n; 371 return n;
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max)
376{
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p;
379 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
381 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) "
383 "in inode #%lu, offset=%d",
384 le32_to_cpu(*bref), maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO;
387 }
388 bref++;
389 }
390 return 0;
391}
392
393
394#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397
398#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS)
401
374/** 402/**
375 * ext4_get_branch - read the chain of indirect blocks leading to data 403 * ext4_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question 404 * @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
415 if (!p->key) 443 if (!p->key)
416 goto no_block; 444 goto no_block;
417 while (--depth) { 445 while (--depth) {
418 bh = sb_bread(sb, le32_to_cpu(p->key)); 446 bh = sb_getblk(sb, le32_to_cpu(p->key));
419 if (!bh) 447 if (unlikely(!bh))
420 goto failure; 448 goto failure;
449
450 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) {
452 put_bh(bh);
453 goto failure;
454 }
455 /* validate block references */
456 if (ext4_check_indirect_blockref(inode, bh)) {
457 put_bh(bh);
458 goto failure;
459 }
460 }
461
421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
422 /* Reader: end */ 463 /* Reader: end */
423 if (!p->key) 464 if (!p->key)
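Switching from sb_bread() to sb_getblk() plus bh_uptodate_or_lock()/bh_submit_read() lets the new blockref validation run only when a block is actually read from disk; cached, already-verified blocks skip both the I/O and the re-check. A toy userspace model of that read-once/validate-once shape; the "cache" and names are stand-ins, not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cached_block {
	int uptodate;			/* models buffer_uptodate(bh) */
	uint32_t refs[4];		/* models bh->b_data as __le32[] */
};

static int submit_read(struct cached_block *b)
{
	static const uint32_t on_disk[4] = { 10, 20, 30, 40 };

	memcpy(b->refs, on_disk, sizeof(on_disk));
	return 0;			/* models bh_submit_read() success */
}

static int get_validated(struct cached_block *b, uint32_t maxblocks)
{
	int i;

	if (!b->uptodate) {		/* models !bh_uptodate_or_lock(bh) */
		if (submit_read(b) < 0)
			return -1;
		for (i = 0; i < 4; i++)	/* ext4_check_indirect_blockref() */
			if (b->refs[i] >= maxblocks)
				return -1;
		b->uptodate = 1;
	}
	return 0;			/* later calls: no I/O, no re-check */
}

int main(void)
{
	struct cached_block b = { 0, { 0 } };

	printf("cold read: %d\n", get_validated(&b, 1000));	/* validates */
	printf("warm hit:  %d\n", get_validated(&b, 1000));	/* skips both */
	return 0;
}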
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
459 ext4_fsblk_t bg_start; 500 ext4_fsblk_t bg_start;
460 ext4_fsblk_t last_block; 501 ext4_fsblk_t last_block;
461 ext4_grpblk_t colour; 502 ext4_grpblk_t colour;
503 ext4_group_t block_group;
504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 505
463 /* Try to find previous block */ 506 /* Try to find previous block */
464 for (p = ind->p - 1; p >= start; p--) { 507 for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
474 * It is going to be referred to from the inode itself? OK, just put it 517 * It is going to be referred to from the inode itself? OK, just put it
475 * into the same cylinder group then. 518 * into the same cylinder group then.
476 */ 519 */
477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 520 block_group = ei->i_block_group;
521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
522 block_group &= ~(flex_size-1);
523 if (S_ISREG(inode->i_mode))
524 block_group++;
525 }
526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 528
529 /*
530 * If we are doing delayed allocation, we don't need to take
531 * colour into account.
532 */
533 if (test_opt(inode->i_sb, DELALLOC))
534 return bg_start;
535
480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 colour = (current->pid % 16) * 537 colour = (current->pid % 16) *
482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
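The goal heuristic above now rounds the inode's group down to its flexgroup, steers regular-file data past the flexgroup's first group (left for metadata), and skips the pid-based colour entirely under delayed allocation. A compressed model; the geometry and the dir-alloc threshold are made up, not the kernel's constants.

#include <stdio.h>

#define BLOCKS_PER_GROUP 8192ULL
#define FLEX_SIZE	 16	/* assumed >= the dir-alloc scheme cutoff */

static unsigned long long goal_block(unsigned block_group, int is_reg,
				     int pid, int delalloc)
{
	unsigned long long bg_start;

	block_group &= ~(FLEX_SIZE - 1);	/* start of the flexgroup */
	if (is_reg)
		block_group++;	/* first group is kept for metadata */
	bg_start = block_group * BLOCKS_PER_GROUP;

	if (delalloc)		/* placement deferred to writeback */
		return bg_start;
	return bg_start + (pid % 16) * (BLOCKS_PER_GROUP / 16);
}

int main(void)
{
	printf("goal=%llu\n", goal_block(18, 1, 4242, 0));	/* 140288 */
	return 0;
}

Dropping the colour under delalloc matches the comment in the hunk: spreading writers by pid is pointless when the real allocation is batched up at writeback time anyway.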
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1052 /* 1108 /*
1053 * free those over-booking quota for metadata blocks 1109 * free those over-booking quota for metadata blocks
1054 */ 1110 */
1055
1056 if (mdb_free) 1111 if (mdb_free)
1057 vfs_dq_release_reservation_block(inode, mdb_free); 1112 vfs_dq_release_reservation_block(inode, mdb_free);
1113
1114 /*
1115 * If we have done all the pending block allocations and if
1116 * there aren't any writers on the inode, we can discard the
1117 * inode's preallocations.
1118 */
1119 if (!total && (atomic_read(&inode->i_writecount) == 0))
1120 ext4_discard_preallocations(inode);
1058} 1121}
1059 1122
1060/* 1123/*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
1688 1751
1689struct mpage_da_data { 1752struct mpage_da_data {
1690 struct inode *inode; 1753 struct inode *inode;
1691 struct buffer_head lbh; /* extent of blocks */ 1754 sector_t b_blocknr; /* start block number of extent */
1755 size_t b_size; /* size of extent */
1756 unsigned long b_state; /* state of the extent */
1692 unsigned long first_page, next_page; /* extent of pages */ 1757 unsigned long first_page, next_page; /* extent of pages */
1693 get_block_t *get_block;
1694 struct writeback_control *wbc; 1758 struct writeback_control *wbc;
1695 int io_done; 1759 int io_done;
1696 int pages_written; 1760 int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
1704 * @mpd->inode: inode 1768 * @mpd->inode: inode
1705 * @mpd->first_page: first page of the extent 1769 * @mpd->first_page: first page of the extent
1706 * @mpd->next_page: page after the last page of the extent 1770 * @mpd->next_page: page after the last page of the extent
1707 * @mpd->get_block: the filesystem's block mapper function
1708 * 1771 *
1709 * By the time mpage_da_submit_io() is called we expect all blocks 1772 * By the time mpage_da_submit_io() is called we expect all blocks
1710 * to be allocated. this may be wrong if allocation failed. 1773 * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1724 /* 1787 /*
1725 * We need to start from the first_page to the next_page - 1 1788 * We need to start from the first_page to the next_page - 1
1726 * to make sure we also write the mapped dirty buffer_heads. 1789 * to make sure we also write the mapped dirty buffer_heads.
1727 * If we look at mpd->lbh.b_blocknr we would only be looking 1790 * If we look at mpd->b_blocknr we would only be looking
1728 * at the currently mapped buffer_heads. 1791 * at the currently mapped buffer_heads.
1729 */ 1792 */
1730 index = mpd->first_page; 1793 index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
1914 return; 1977 return;
1915} 1978}
1916 1979
1980#define EXT4_DELALLOC_RSVED 1
1981static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1982 struct buffer_head *bh_result, int create)
1983{
1984 int ret;
1985 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1986 loff_t disksize = EXT4_I(inode)->i_disksize;
1987 handle_t *handle = NULL;
1988
1989 handle = ext4_journal_current_handle();
1990 BUG_ON(!handle);
1991 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
1992 bh_result, create, 0, EXT4_DELALLOC_RSVED);
1993 if (ret <= 0)
1994 return ret;
1995
1996 bh_result->b_size = (ret << inode->i_blkbits);
1997
1998 if (ext4_should_order_data(inode)) {
1999 int retval;
2000 retval = ext4_jbd2_file_inode(handle, inode);
2001 if (retval)
2002 /*
2003 * Failed to add inode for ordered mode. Don't
2004 * update file size
2005 */
2006 return retval;
2007 }
2008
2009 /*
2010 * Update on-disk size along with block allocation; we don't
2011 * use 'extend_disksize' as size may change within already
2012 * allocated block -bzzz
2013 */
2014 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2015 if (disksize > i_size_read(inode))
2016 disksize = i_size_read(inode);
2017 if (disksize > EXT4_I(inode)->i_disksize) {
2018 ext4_update_i_disksize(inode, disksize);
2019 ret = ext4_mark_inode_dirty(handle, inode);
2020 return ret;
2021 }
2022 return 0;
2023}
2024
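The size update at the end of ext4_da_get_block_write() converts the mapped range back to bytes and clamps it to the in-core i_size, so delayed allocation never pushes i_disksize past EOF. A two-line worked example with assumed 4KiB blocks:

#include <stdio.h>

int main(void)
{
	unsigned int i_blkbits = 12;	/* 4096-byte blocks */
	unsigned long long iblock = 5, ret = 3, i_size = 30000;
	unsigned long long disksize = (iblock + ret) << i_blkbits;

	if (disksize > i_size)		/* clamp: never account past EOF */
		disksize = i_size;
	printf("disksize=%llu\n", disksize);	/* 30000, not 32768 */
	return 0;
}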
1917/* 2025/*
1918 * mpage_da_map_blocks - go through given space 2026 * mpage_da_map_blocks - go through given space
1919 * 2027 *
1920 * @mpd->lbh - bh describing space 2028 * @mpd - extent of blocks to map
1921 * @mpd->get_block - the filesystem's block mapper function
1922 * 2029 *
1923 * The function skips space we know is already mapped to disk blocks. 2030 * The function skips space we know is already mapped to disk blocks.
1924 * 2031 *
1925 */ 2032 */
1926static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2033static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1927{ 2034{
1928 int err = 0; 2035 int err = 0;
1929 struct buffer_head new; 2036 struct buffer_head new;
1930 struct buffer_head *lbh = &mpd->lbh;
1931 sector_t next; 2037 sector_t next;
1932 2038
1933 /* 2039 /*
1934 * We consider only non-mapped and non-allocated blocks 2040 * We consider only non-mapped and non-allocated blocks
1935 */ 2041 */
1936 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 2042 if ((mpd->b_state & (1 << BH_Mapped)) &&
2043 !(mpd->b_state & (1 << BH_Delay)))
1937 return 0; 2044 return 0;
1938 new.b_state = lbh->b_state; 2045 new.b_state = mpd->b_state;
1939 new.b_blocknr = 0; 2046 new.b_blocknr = 0;
1940 new.b_size = lbh->b_size; 2047 new.b_size = mpd->b_size;
1941 next = lbh->b_blocknr; 2048 next = mpd->b_blocknr;
1942 /* 2049 /*
1943 * If we didn't accumulate anything 2050 * If we didn't accumulate anything
1944 * to write simply return 2051 * to write simply return
1945 */ 2052 */
1946 if (!new.b_size) 2053 if (!new.b_size)
1947 return 0; 2054 return 0;
1948 err = mpd->get_block(mpd->inode, next, &new, 1);
1949 if (err) {
1950 2055
1951 /* If get block returns with error 2056 err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
1952 * we simply return. Later writepage 2057 if (err) {
1953 * will redirty the page and writepages 2058 /*
1954 * will find the dirty page again 2059 * If get block returns with error we simply
2060 * return. Later writepage will redirty the page and
2061 * writepages will find the dirty page again
1955 */ 2062 */
1956 if (err == -EAGAIN) 2063 if (err == -EAGAIN)
1957 return 0; 2064 return 0;
1958 2065
1959 if (err == -ENOSPC && 2066 if (err == -ENOSPC &&
1960 ext4_count_free_blocks(mpd->inode->i_sb)) { 2067 ext4_count_free_blocks(mpd->inode->i_sb)) {
1961 mpd->retval = err; 2068 mpd->retval = err;
1962 return 0; 2069 return 0;
1963 } 2070 }
1964 2071
1965 /* 2072 /*
1966 * get block failure will cause us 2073 * get block failure will cause us to loop in
1967 * to loop in writepages. Because 2074 * writepages, because a_ops->writepage won't be able
1968 * a_ops->writepage won't be able to 2075 * to make progress. The page will be redirtied by
1969 * make progress. The page will be redirtied 2076 * writepage and writepages will again try to write
1970 * by writepage and writepages will again 2077 * the same.
1971 * try to write the same.
1972 */ 2078 */
1973 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2079 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1974 "at logical offset %llu with max blocks " 2080 "at logical offset %llu with max blocks "
1975 "%zd with error %d\n", 2081 "%zd with error %d\n",
1976 __func__, mpd->inode->i_ino, 2082 __func__, mpd->inode->i_ino,
1977 (unsigned long long)next, 2083 (unsigned long long)next,
1978 lbh->b_size >> mpd->inode->i_blkbits, err); 2084 mpd->b_size >> mpd->inode->i_blkbits, err);
1979 printk(KERN_EMERG "This should not happen!! " 2085 printk(KERN_EMERG "This should not happen!! "
1980 "Data will be lost\n"); 2086 "Data will be lost\n");
1981 if (err == -ENOSPC) { 2087 if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1983 } 2089 }
1985 /* invalidate all the pages */ 2091 /* invalidate all the pages */
1985 ext4_da_block_invalidatepages(mpd, next, 2091 ext4_da_block_invalidatepages(mpd, next,
1986 lbh->b_size >> mpd->inode->i_blkbits); 2092 mpd->b_size >> mpd->inode->i_blkbits);
1987 return err; 2093 return err;
1988 } 2094 }
1989 BUG_ON(new.b_size == 0); 2095 BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1995 * If blocks are marked delayed, we need to 2101 * If blocks are marked delayed, we need to
1996 * put actual blocknr and drop delayed bit 2102 * put actual blocknr and drop delayed bit
1997 */ 2103 */
1998 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 2104 if ((mpd->b_state & (1 << BH_Delay)) ||
2105 (mpd->b_state & (1 << BH_Unwritten)))
1999 mpage_put_bnr_to_bhs(mpd, next, &new); 2106 mpage_put_bnr_to_bhs(mpd, next, &new);
2000 2107
2001 return 0; 2108 return 0;
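With the extent folded into mpage_da_data as a raw (b_blocknr, b_size, b_state) triple, the buffer_mapped()/buffer_delay() helpers give way to explicit tests against the state bitmask, as in the hunk above. A standalone model of the skip test; the bit numbers are illustrative and do not match the kernel's enum bh_state_bits.

#include <stdio.h>

enum { BH_Uptodate, BH_Dirty, BH_Mapped, BH_Delay, BH_Unwritten };

static int needs_allocation(unsigned long b_state)
{
	/* mapped and not delayed: already has disk blocks, nothing to do */
	if ((b_state & (1 << BH_Mapped)) && !(b_state & (1 << BH_Delay)))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", needs_allocation(1 << BH_Mapped));		/* 0 */
	printf("%d\n", needs_allocation((1 << BH_Mapped) |
					(1 << BH_Delay)));		/* 1 */
	return 0;
}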
@@ -2014,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2014 * the function is used to collect contig. blocks in same state 2121 * the function is used to collect contig. blocks in same state
2015 */ 2122 */
2016static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2123static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2017 sector_t logical, struct buffer_head *bh) 2124 sector_t logical, size_t b_size,
2125 unsigned long b_state)
2018{ 2126{
2019 sector_t next; 2127 sector_t next;
2020 size_t b_size = bh->b_size; 2128 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2021 struct buffer_head *lbh = &mpd->lbh;
2022 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
2023 2129
2024 /* check if the reserved journal credits might overflow */ 2130 /* check if the reserved journal credits might overflow */
2025 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2131 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2046 /* 2152 /*
2047 * First block in the extent 2153 * First block in the extent
2048 */ 2154 */
2049 if (lbh->b_size == 0) { 2155 if (mpd->b_size == 0) {
2050 lbh->b_blocknr = logical; 2156 mpd->b_blocknr = logical;
2051 lbh->b_size = b_size; 2157 mpd->b_size = b_size;
2052 lbh->b_state = bh->b_state & BH_FLAGS; 2158 mpd->b_state = b_state & BH_FLAGS;
2053 return; 2159 return;
2054 } 2160 }
2055 2161
2056 next = lbh->b_blocknr + nrblocks; 2162 next = mpd->b_blocknr + nrblocks;
2057 /* 2163 /*
2058 * Can we merge the block to our big extent? 2164 * Can we merge the block to our big extent?
2059 */ 2165 */
2060 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2166 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2061 lbh->b_size += b_size; 2167 mpd->b_size += b_size;
2062 return; 2168 return;
2063 } 2169 }
2064 2170
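The merge rule in mpage_add_bh_to_extent() is: absorb a chunk only if it begins exactly where the accumulated extent ends and carries identical state bits; anything else forces the extent out to allocation and I/O. A self-contained model of the accumulator; the 4KiB block size and state values are arbitrary.

#include <stddef.h>
#include <stdio.h>

struct extent {
	unsigned long long b_blocknr;
	size_t b_size;			/* bytes */
	unsigned long b_state;
};

#define BLKBITS 12			/* assume 4KiB blocks */

static int try_merge(struct extent *e, unsigned long long logical,
		     size_t b_size, unsigned long b_state)
{
	unsigned long long next = e->b_blocknr + (e->b_size >> BLKBITS);

	if (e->b_size == 0) {		/* first block of the extent */
		e->b_blocknr = logical;
		e->b_size = b_size;
		e->b_state = b_state;
		return 1;
	}
	if (logical == next && b_state == e->b_state) {
		e->b_size += b_size;	/* contiguous, same state: absorb */
		return 1;
	}
	return 0;			/* caller maps and submits the extent */
}

int main(void)
{
	struct extent e = { 0, 0, 0 };

	try_merge(&e, 100, 4096, 0x5);
	printf("merged=%d size=%zu\n",
	       try_merge(&e, 101, 4096, 0x5), e.b_size);	/* 1, 8192 */
	return 0;
}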
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
2087{ 2193{
2088 struct mpage_da_data *mpd = data; 2194 struct mpage_da_data *mpd = data;
2089 struct inode *inode = mpd->inode; 2195 struct inode *inode = mpd->inode;
2090 struct buffer_head *bh, *head, fake; 2196 struct buffer_head *bh, *head;
2091 sector_t logical; 2197 sector_t logical;
2092 2198
2093 if (mpd->io_done) { 2199 if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
2129 /* 2235 /*
2130 * ... and blocks 2236 * ... and blocks
2131 */ 2237 */
2132 mpd->lbh.b_size = 0; 2238 mpd->b_size = 0;
2133 mpd->lbh.b_state = 0; 2239 mpd->b_state = 0;
2134 mpd->lbh.b_blocknr = 0; 2240 mpd->b_blocknr = 0;
2135 } 2241 }
2136 2242
2137 mpd->next_page = page->index + 1; 2243 mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
2139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2245 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2140 2246
2141 if (!page_has_buffers(page)) { 2247 if (!page_has_buffers(page)) {
2142 /* 2248 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2143 * There is no attached buffer heads yet (mmap?) 2249 (1 << BH_Dirty) | (1 << BH_Uptodate));
2144 * we treat the page asfull of dirty blocks
2145 */
2146 bh = &fake;
2147 bh->b_size = PAGE_CACHE_SIZE;
2148 bh->b_state = 0;
2149 set_buffer_dirty(bh);
2150 set_buffer_uptodate(bh);
2151 mpage_add_bh_to_extent(mpd, logical, bh);
2152 if (mpd->io_done) 2250 if (mpd->io_done)
2153 return MPAGE_DA_EXTENT_TAIL; 2251 return MPAGE_DA_EXTENT_TAIL;
2154 } else { 2252 } else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
2166 * with the page in ext4_da_writepage 2264 * with the page in ext4_da_writepage
2167 */ 2265 */
2168 if (buffer_dirty(bh) && 2266 if (buffer_dirty(bh) &&
2169 (!buffer_mapped(bh) || buffer_delay(bh))) { 2267 (!buffer_mapped(bh) || buffer_delay(bh))) {
2170 mpage_add_bh_to_extent(mpd, logical, bh); 2268 mpage_add_bh_to_extent(mpd, logical,
2269 bh->b_size,
2270 bh->b_state);
2171 if (mpd->io_done) 2271 if (mpd->io_done)
2172 return MPAGE_DA_EXTENT_TAIL; 2272 return MPAGE_DA_EXTENT_TAIL;
2173 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2273 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
2179 * unmapped buffer_head later we need to 2279 * unmapped buffer_head later we need to
2180 * use the b_state flag of that buffer_head. 2280 * use the b_state flag of that buffer_head.
2181 */ 2281 */
2182 if (mpd->lbh.b_size == 0) 2282 if (mpd->b_size == 0)
2183 mpd->lbh.b_state = 2283 mpd->b_state = bh->b_state & BH_FLAGS;
2184 bh->b_state & BH_FLAGS;
2185 } 2284 }
2186 logical++; 2285 logical++;
2187 } while ((bh = bh->b_this_page) != head); 2286 } while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
2191} 2290}
2192 2291
2193/* 2292/*
2194 * mpage_da_writepages - walk the list of dirty pages of the given
2195 * address space, allocates non-allocated blocks, maps newly-allocated
2196 * blocks to existing bhs and issue IO them
2197 *
2198 * @mapping: address space structure to write
2199 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2200 * @get_block: the filesystem's block mapper function.
2201 *
2202 * This is a library function, which implements the writepages()
2203 * address_space_operation.
2204 */
2205static int mpage_da_writepages(struct address_space *mapping,
2206 struct writeback_control *wbc,
2207 struct mpage_da_data *mpd)
2208{
2209 int ret;
2210
2211 if (!mpd->get_block)
2212 return generic_writepages(mapping, wbc);
2213
2214 mpd->lbh.b_size = 0;
2215 mpd->lbh.b_state = 0;
2216 mpd->lbh.b_blocknr = 0;
2217 mpd->first_page = 0;
2218 mpd->next_page = 0;
2219 mpd->io_done = 0;
2220 mpd->pages_written = 0;
2221 mpd->retval = 0;
2222
2223 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2224 /*
2225 * Handle last extent of pages
2226 */
2227 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2228 if (mpage_da_map_blocks(mpd) == 0)
2229 mpage_da_submit_io(mpd);
2230
2231 mpd->io_done = 1;
2232 ret = MPAGE_DA_EXTENT_TAIL;
2233 }
2234 wbc->nr_to_write -= mpd->pages_written;
2235 return ret;
2236}
2237
2238/*
2239 * this is a special callback for ->write_begin() only 2293 * this is a special callback for ->write_begin() only
2240 * its intention is to return mapped block or reserve space 2294 * its intention is to return mapped block or reserve space
2241 */ 2295 */
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2274 2328
2275 return ret; 2329 return ret;
2276} 2330}
2277#define EXT4_DELALLOC_RSVED 1
2278static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2279 struct buffer_head *bh_result, int create)
2280{
2281 int ret;
2282 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2283 loff_t disksize = EXT4_I(inode)->i_disksize;
2284 handle_t *handle = NULL;
2285
2286 handle = ext4_journal_current_handle();
2287 BUG_ON(!handle);
2288 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2289 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2290 if (ret > 0) {
2291
2292 bh_result->b_size = (ret << inode->i_blkbits);
2293
2294 if (ext4_should_order_data(inode)) {
2295 int retval;
2296 retval = ext4_jbd2_file_inode(handle, inode);
2297 if (retval)
2298 /*
2299 * Failed to add inode for ordered
2300 * mode. Don't update file size
2301 */
2302 return retval;
2303 }
2304
2305 /*
2306 * Update on-disk size along with block allocation
2307 * we don't use 'extend_disksize' as size may change
2308 * within already allocated block -bzzz
2309 */
2310 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2311 if (disksize > i_size_read(inode))
2312 disksize = i_size_read(inode);
2313 if (disksize > EXT4_I(inode)->i_disksize) {
2314 ext4_update_i_disksize(inode, disksize);
2315 ret = ext4_mark_inode_dirty(handle, inode);
2316 return ret;
2317 }
2318 ret = 0;
2319 }
2320 return ret;
2321}
2322 2331
2323static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2332static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2324{ 2333{
@@ -2569,8 +2578,38 @@ retry:
2569 dump_stack(); 2578 dump_stack();
2570 goto out_writepages; 2579 goto out_writepages;
2571 } 2580 }
2572 mpd.get_block = ext4_da_get_block_write; 2581
2573 ret = mpage_da_writepages(mapping, wbc, &mpd); 2582 /*
2583 * Now call __mpage_da_writepage to find the next
2584 * contiguous region of logical blocks that need
2585 * blocks to be allocated by ext4. We don't actually
2586 * submit the blocks for I/O here, even though
2587 * write_cache_pages thinks it will, and will set the
2588 * pages as clean for write before calling
2589 * __mpage_da_writepage().
2590 */
2591 mpd.b_size = 0;
2592 mpd.b_state = 0;
2593 mpd.b_blocknr = 0;
2594 mpd.first_page = 0;
2595 mpd.next_page = 0;
2596 mpd.io_done = 0;
2597 mpd.pages_written = 0;
2598 mpd.retval = 0;
2599 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2600 &mpd);
2601 /*
2602 * If we have a contiguous extent of pages and we
2603 * haven't done the I/O yet, map the blocks and submit
2604 * them for I/O.
2605 */
2606 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2607 if (mpage_da_map_blocks(&mpd) == 0)
2608 mpage_da_submit_io(&mpd);
2609 mpd.io_done = 1;
2610 ret = MPAGE_DA_EXTENT_TAIL;
2611 }
2612 wbc->nr_to_write -= mpd.pages_written;
2574 2613
2575 ext4_journal_stop(handle); 2614 ext4_journal_stop(handle);
2576 2615
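With mpage_da_writepages() removed, ext4_da_writepages() now resets the accumulator itself, walks the dirty pages via write_cache_pages(), and must flush the final extent by hand: the callback only submits an extent when the next page breaks contiguity, so the last run is still pending when the walk ends. A generic model of that accumulate-then-flush-the-tail pattern; the page indices are invented.

#include <stdio.h>

struct run { int first, next; };	/* [first, next), like first_page/next_page */

static void flush(struct run *r)
{
	if (r->next != r->first)
		printf("submit pages [%d, %d)\n", r->first, r->next);
	r->first = r->next = 0;
}

static void add_page(struct run *r, int index)
{
	if (r->next != r->first && index != r->next)
		flush(r);		/* discontiguous: submit the batch */
	if (r->next == r->first)
		r->first = index;	/* start a new run */
	r->next = index + 1;
}

int main(void)
{
	struct run r = { 0, 0 };
	int pages[] = { 3, 4, 5, 9, 10 };
	int i;

	for (i = 0; i < 5; i++)
		add_page(&r, pages[i]);
	flush(&r);	/* the tail-extent flush done inline above */
	return 0;	/* prints [3,6) then [9,11) */
}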
@@ -2846,6 +2885,48 @@ out:
2846 return; 2885 return;
2847} 2886}
2848 2887
2888/*
2889 * Force all delayed allocation blocks to be allocated for a given inode.
2890 */
2891int ext4_alloc_da_blocks(struct inode *inode)
2892{
2893 if (!EXT4_I(inode)->i_reserved_data_blocks &&
2894 !EXT4_I(inode)->i_reserved_meta_blocks)
2895 return 0;
2896
2897 /*
2898 * We do something simple for now. The filemap_flush() will
2899 * also start triggering a write of the data blocks, which is
2900 * not strictly speaking necessary (and for users of
2901 * laptop_mode, not even desirable). However, to do otherwise
2902 * would require replicating code paths in:
2903 *
2904 * ext4_da_writepages() ->
2905 * write_cache_pages() ---> (via passed in callback function)
2906 * __mpage_da_writepage() -->
2907 * mpage_add_bh_to_extent()
2908 * mpage_da_map_blocks()
2909 *
2910 * The problem is that write_cache_pages(), located in
2911 * mm/page-writeback.c, marks pages clean in preparation for
2912 * doing I/O, which is not desirable if we're not planning on
2913 * doing I/O at all.
2914 *
2915 * We could call write_cache_pages(), and then redirty all of
2916 * the pages by calling redirty_page_for_writeback() but that
2917 * would be ugly in the extreme. So instead we would need to
2918 * replicate parts of the code in the above functions,
2919 * simplifying them because we wouldn't actually intend to
2920 * write out the pages, but rather only collect contiguous
2921 * logical block extents, call the multi-block allocator, and
2922 * then update the buffer heads with the block allocations.
2923 *
2924 * For now, though, we'll cheat by calling filemap_flush(),
2925 * which will map the blocks, and start the I/O, but not
2926 * actually wait for the I/O to complete.
2927 */
2928 return filemap_flush(inode->i_mapping);
2929}
2849 2930
2850/* 2931/*
2851 * bmap() is special. It gets used by applications such as lilo and by 2932 * bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
3868 if (!ext4_can_truncate(inode)) 3949 if (!ext4_can_truncate(inode))
3869 return; 3950 return;
3870 3951
3952 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3953 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3954
3871 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3955 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3872 ext4_ext_truncate(inode); 3956 ext4_ext_truncate(inode);
3873 return; 3957 return;
@@ -4110,12 +4194,7 @@ make_io:
4110 unsigned num; 4194 unsigned num;
4111 4195
4112 table = ext4_inode_table(sb, gdp); 4196 table = ext4_inode_table(sb, gdp);
4113 /* Make sure s_inode_readahead_blks is a power of 2 */ 4197 /* s_inode_readahead_blks is always a power of 2 */
4114 while (EXT4_SB(sb)->s_inode_readahead_blks &
4115 (EXT4_SB(sb)->s_inode_readahead_blks-1))
4116 EXT4_SB(sb)->s_inode_readahead_blks =
4117 (EXT4_SB(sb)->s_inode_readahead_blks &
4118 (EXT4_SB(sb)->s_inode_readahead_blks-1));
4119 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4198 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4120 if (table > b) 4199 if (table > b)
4121 b = table; 4200 b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4287 ei->i_disksize = inode->i_size; 4366 ei->i_disksize = inode->i_size;
4288 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4367 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4289 ei->i_block_group = iloc.block_group; 4368 ei->i_block_group = iloc.block_group;
4369 ei->i_last_alloc_group = ~0;
4290 /* 4370 /*
4291 * NOTE! The in-memory inode i_data array is in little-endian order 4371 * NOTE! The in-memory inode i_data array is in little-endian order
4292 * even on big-endian machines: we do NOT byteswap the block numbers! 4372 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4329 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4409 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4330 } 4410 }
4331 4411
4412 if (ei->i_flags & EXT4_EXTENTS_FL) {
4413 /* Validate extent which is part of inode */
4414 ret = ext4_ext_check_inode(inode);
4415 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4416 (S_ISLNK(inode->i_mode) &&
4417 !ext4_inode_is_fast_symlink(inode))) {
4418 /* Validate block references which are part of inode */
4419 ret = ext4_check_inode_blockref(inode);
4420 }
4421 if (ret) {
4422 brelse(bh);
4423 goto bad_inode;
4424 }
4425
4332 if (S_ISREG(inode->i_mode)) { 4426 if (S_ISREG(inode->i_mode)) {
4333 inode->i_op = &ext4_file_inode_operations; 4427 inode->i_op = &ext4_file_inode_operations;
4334 inode->i_fop = &ext4_file_operations; 4428 inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4345 inode->i_op = &ext4_symlink_inode_operations; 4439 inode->i_op = &ext4_symlink_inode_operations;
4346 ext4_set_aops(inode); 4440 ext4_set_aops(inode);
4347 } 4441 }
4348 } else { 4442 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4443 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4349 inode->i_op = &ext4_special_inode_operations; 4444 inode->i_op = &ext4_special_inode_operations;
4350 if (raw_inode->i_block[0]) 4445 if (raw_inode->i_block[0])
4351 init_special_inode(inode, inode->i_mode, 4446 init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4353 else 4448 else
4354 init_special_inode(inode, inode->i_mode, 4449 init_special_inode(inode, inode->i_mode,
4355 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4450 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4451 } else {
4452 brelse(bh);
4453 ret = -EIO;
4454 ext4_error(inode->i_sb, __func__,
4455 "bogus i_mode (%o) for inode=%lu",
4456 inode->i_mode, inode->i_ino);
4457 goto bad_inode;
4356 } 4458 }
4357 brelse(iloc.bh); 4459 brelse(iloc.bh);
4358 ext4_set_inode_flags(inode); 4460 ext4_set_inode_flags(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 if (!S_ISDIR(inode->i_mode)) 51 flags = ext4_mask_flags(inode->i_mode, flags);
52 flags &= ~EXT4_DIRSYNC_FL;
53 52
54 err = -EPERM; 53 err = -EPERM;
55 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
263 return err; 262 return err;
264 } 263 }
265 264
265 case EXT4_IOC_ALLOC_DA_BLKS:
266 {
267 int err;
268 if (!is_owner_or_cap(inode))
269 return -EACCES;
270
271 err = mnt_want_write(filp->f_path.mnt);
272 if (err)
273 return err;
274 err = ext4_alloc_da_blocks(inode);
275 mnt_drop_write(filp->f_path.mnt);
276 return err;
277 }
278
266 default: 279 default:
267 return -ENOTTY; 280 return -ENOTTY;
268 } 281 }
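A userspace sketch of driving the new ioctl, for example to force block allocation on a freshly written file before renaming it over an old one. The request number is an assumption taken from this kernel's fs/ext4/ext4.h (_IO('f', 12)); verify it against the headers you actually build against.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef EXT4_IOC_ALLOC_DA_BLKS
#define EXT4_IOC_ALLOC_DA_BLKS	_IO('f', 12)	/* assumed value; check ext4.h */
#endif

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, EXT4_IOC_ALLOC_DA_BLKS) < 0) {
		perror("EXT4_IOC_ALLOC_DA_BLKS");
		return 1;
	}
	close(fd);	/* delayed blocks are now allocated; safe to rename */
	return 0;
}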
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd039..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involve request for multiple number of blocks 46 * The allocation request involves a request for multiple blocks
47 * near to the goal(block) value specified. 47 * near to the goal (block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 allocation or the current file size which ever is larger. If the size is 52 would have after allocation, or the current file size, whichever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small file use group preallocation is to 59 * The main motivation for having small files use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
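The size cutoff described in the comment above is the whole group-vs-inode preallocation decision: take the larger of the current file size and the size after the pending allocation, and compare it with s_mb_stream_request. A compressed model; pa_choice() is a hypothetical helper, and the 16-block default is the one cited in the comment.

#include <stdio.h>

#define MB_STREAM_REQ 16	/* blocks; tunable via mb_stream_req in sysfs */

static const char *pa_choice(unsigned cur_blocks, unsigned after_alloc)
{
	unsigned size = cur_blocks > after_alloc ? cur_blocks : after_alloc;

	return size < MB_STREAM_REQ ? "group (locality-group) preallocation"
				    : "inode preallocation";
}

int main(void)
{
	printf("%s\n", pa_choice(2, 5));	/* small file: group PA */
	printf("%s\n", pa_choice(10, 400));	/* large file: inode PA */
	return 0;
}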
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator (using the buddy cache) supports a few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contigous block in 142 * stripe size (sbi->s_stripe), we try to search for contigous block in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicate how long the mballoc __must__ look for a best 146 * min_to_scan indicate how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726{ 1725{
1727 unsigned free, fragments; 1726 unsigned free, fragments;
1728 unsigned i, bits; 1727 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc; 1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1731
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0; 1748 return 0;
1749 1749
1750 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1752 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1753 ((group % flex_size) == 0))
1754 return 0;
1755
1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1756 bits = ac->ac_sb->s_blocksize_bits + 1;
1751 for (i = ac->ac_2order; i <= bits; i++) 1757 for (i = ac->ac_2order; i <= bits; i++)
1752 if (grp->bb_counters[i] > 0) 1758 if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1971 /* 1977 /*
1972 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1973 * is greater than equal to the sbi_s_mb_order2_reqs 1979 * is greater than equal to the sbi_s_mb_order2_reqs
1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1975 */ 1981 */
1976 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1977 /* 1983 /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2699 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2700 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2695 if (sbi->s_mb_maxs == NULL) { 2701 if (sbi->s_mb_maxs == NULL) {
2696 kfree(sbi->s_mb_maxs); 2702 kfree(sbi->s_mb_offsets);
2697 return -ENOMEM; 2703 return -ENOMEM;
2698 } 2704 }
2699 2705
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2746 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2747 } 2753 }
2748 2754
2749 ext4_mb_init_per_dev_proc(sb);
2750 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2751 2756
2752 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2829 2834
2830 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2831 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2832 ext4_mb_destroy_per_dev_proc(sb);
2833 2837
2834 return 0; 2838 return 0;
2835} 2839}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2891} 2895}
2892 2896
2893#define EXT4_MB_STATS_NAME "stats"
2894#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2895#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2896#define EXT4_MB_ORDER2_REQ "order2_req"
2897#define EXT4_MB_STREAM_REQ "stream_req"
2898#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2899
2900static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2901{
2902#ifdef CONFIG_PROC_FS
2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2904 struct ext4_sb_info *sbi = EXT4_SB(sb);
2905 struct proc_dir_entry *proc;
2906
2907 if (sbi->s_proc == NULL)
2908 return -EINVAL;
2909
2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2916 return 0;
2917
2918err_out:
2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2925 return -ENOMEM;
2926#else
2927 return 0;
2928#endif
2929}
2930
2931static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2932{
2933#ifdef CONFIG_PROC_FS
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935
2936 if (sbi->s_proc == NULL)
2937 return -EINVAL;
2938
2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2945#endif
2946 return 0;
2947}
2948
2949int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2950{ 2898{
2951 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3096 if (sbi->s_log_groups_per_flex) { 3044 if (sbi->s_log_groups_per_flex) {
3097 ext4_group_t flex_group = ext4_flex_group(sbi, 3045 ext4_group_t flex_group = ext4_flex_group(sbi,
3098 ac->ac_b_ex.fe_group); 3046 ac->ac_b_ex.fe_group);
3099 spin_lock(sb_bgl_lock(sbi, flex_group)); 3047 atomic_sub(ac->ac_b_ex.fe_len,
3100 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 3048 &sbi->s_flex_groups[flex_group].free_blocks);
3101 spin_unlock(sb_bgl_lock(sbi, flex_group));
3102 } 3049 }
3103 3050
3104 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3051 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
3116 * here we normalize request for locality group 3063 * here we normalize request for locality group
3117 * Group request are normalized to s_strip size if we set the same via mount 3064 * Group request are normalized to s_strip size if we set the same via mount
3118 * option. If not we set it to s_mb_group_prealloc which can be configured via 3065 * option. If not we set it to s_mb_group_prealloc which can be configured via
3119 * /proc/fs/ext4/<partition>/group_prealloc 3066 * /sys/fs/ext4/<partition>/mb_group_prealloc
3120 * 3067 *
3121 * XXX: should we try to preallocate more than the group has now? 3068 * XXX: should we try to preallocate more than the group has now?
3122 */ 3069 */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3608 spin_unlock(&pa->pa_lock); 3555 spin_unlock(&pa->pa_lock);
3609 3556
3610 grp_blk = pa->pa_pstart; 3557 grp_blk = pa->pa_pstart;
3611 /* If linear, pa_pstart may be in the next group when pa is used up */ 3558 /*
3612 if (pa->pa_linear) 3559 * If doing group-based preallocation, pa_pstart may be in the
3560 * next group when pa is used up
3561 */
3562 if (pa->pa_type == MB_GROUP_PA)
3613 grp_blk--; 3563 grp_blk--;
3614 3564
3615 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3565 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3704 INIT_LIST_HEAD(&pa->pa_inode_list); 3654 INIT_LIST_HEAD(&pa->pa_inode_list);
3705 INIT_LIST_HEAD(&pa->pa_group_list); 3655 INIT_LIST_HEAD(&pa->pa_group_list);
3706 pa->pa_deleted = 0; 3656 pa->pa_deleted = 0;
3707 pa->pa_linear = 0; 3657 pa->pa_type = MB_INODE_PA;
3708 3658
3709 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3710 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3767 INIT_LIST_HEAD(&pa->pa_inode_list); 3717 INIT_LIST_HEAD(&pa->pa_inode_list);
3768 INIT_LIST_HEAD(&pa->pa_group_list); 3718 INIT_LIST_HEAD(&pa->pa_group_list);
3769 pa->pa_deleted = 0; 3719 pa->pa_deleted = 0;
3770 pa->pa_linear = 1; 3720 pa->pa_type = MB_GROUP_PA;
3771 3721
3772 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3773 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
4021 list_del_rcu(&pa->pa_inode_list); 3971 list_del_rcu(&pa->pa_inode_list);
4022 spin_unlock(pa->pa_obj_lock); 3972 spin_unlock(pa->pa_obj_lock);
4023 3973
4024 if (pa->pa_linear) 3974 if (pa->pa_type == MB_GROUP_PA)
4025 ext4_mb_release_group_pa(&e4b, pa, ac); 3975 ext4_mb_release_group_pa(&e4b, pa, ac);
4026 else 3976 else
4027 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3977 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
4121 spin_unlock(&ei->i_prealloc_lock); 4071 spin_unlock(&ei->i_prealloc_lock);
4122 4072
4123 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4073 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4124 BUG_ON(pa->pa_linear != 0); 4074 BUG_ON(pa->pa_type != MB_INODE_PA);
4125 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4075 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4126 4076
4127 err = ext4_mb_load_buddy(sb, group, &e4b); 4077 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4232 * file is determined by the current size or the resulting size after 4182 * file is determined by the current size or the resulting size after
4233 * allocation, whichever is larger 4183 * allocation, whichever is larger
4234 * 4184 *
4235 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4185 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4236 */ 4186 */
4237static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4187static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4238{ 4188{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4373 continue; 4323 continue;
4374 } 4324 }
4375 /* only lg prealloc space */ 4325 /* only lg prealloc space */
4376 BUG_ON(!pa->pa_linear); 4326 BUG_ON(pa->pa_type != MB_GROUP_PA);
4377 4327
4378 /* seems this one can be freed ... */ 4328 /* seems this one can be freed ... */
4379 pa->pa_deleted = 1; 4329 pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4442 pa_inode_list) { 4392 pa_inode_list) {
4443 spin_lock(&tmp_pa->pa_lock); 4393 spin_lock(&tmp_pa->pa_lock);
4444 if (tmp_pa->pa_deleted) { 4394 if (tmp_pa->pa_deleted) {
4445 spin_unlock(&pa->pa_lock); 4395 spin_unlock(&tmp_pa->pa_lock);
4446 continue; 4396 continue;
4447 } 4397 }
4448 if (!added && pa->pa_free < tmp_pa->pa_free) { 4398 if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4479{ 4429{
4480 struct ext4_prealloc_space *pa = ac->ac_pa; 4430 struct ext4_prealloc_space *pa = ac->ac_pa;
4481 if (pa) { 4431 if (pa) {
4482 if (pa->pa_linear) { 4432 if (pa->pa_type == MB_GROUP_PA) {
4483 /* see comment in ext4_mb_use_group_pa() */ 4433 /* see comment in ext4_mb_use_group_pa() */
4484 spin_lock(&pa->pa_lock); 4434 spin_lock(&pa->pa_lock);
4485 pa->pa_pstart += ac->ac_b_ex.fe_len; 4435 pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4499 * doesn't grow big. We need to release 4449 * doesn't grow big. We need to release
4500 * alloc_semp before calling ext4_mb_add_n_trim() 4450 * alloc_semp before calling ext4_mb_add_n_trim()
4501 */ 4451 */
4502 if (pa->pa_linear && likely(pa->pa_free)) { 4452 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4503 spin_lock(pa->pa_obj_lock); 4453 spin_lock(pa->pa_obj_lock);
4504 list_del_rcu(&pa->pa_inode_list); 4454 list_del_rcu(&pa->pa_inode_list);
4505 spin_unlock(pa->pa_obj_lock); 4455 spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
4936 4886
4937 if (sbi->s_log_groups_per_flex) { 4887 if (sbi->s_log_groups_per_flex) {
4938 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4888 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4939 spin_lock(sb_bgl_lock(sbi, flex_group)); 4889 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4940 sbi->s_flex_groups[flex_group].free_blocks += count;
4941 spin_unlock(sb_bgl_lock(sbi, flex_group));
4942 } 4890 }
4943 4891
4944 ext4_mb_release_desc(&e4b); 4892 ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
132 ext4_lblk_t pa_lstart; /* log. block */ 132 ext4_lblk_t pa_lstart; /* log. block */
133 unsigned short pa_len; /* len of preallocated chunk */ 133 unsigned short pa_len; /* len of preallocated chunk */
134 unsigned short pa_free; /* how many blocks are free */ 134 unsigned short pa_free; /* how many blocks are free */
135 unsigned short pa_linear; /* consumed in one direction 135 unsigned short pa_type; /* pa type. inode or group */
136 * strictly, for grp prealloc */
137 spinlock_t *pa_obj_lock; 136 spinlock_t *pa_obj_lock;
138 struct inode *pa_inode; /* hack, for history only */ 137 struct inode *pa_inode; /* hack, for history only */
139}; 138};
140 139
140enum {
141 MB_INODE_PA = 0,
142 MB_GROUP_PA = 1
143};
141 144
142struct ext4_free_extent { 145struct ext4_free_extent {
143 ext4_lblk_t fe_logical; 146 ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
247 250
248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 251#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
249 252
250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 253static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
252 struct ext4_free_extent *fex) 254 struct ext4_free_extent *fex)
253{ 255{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3ee..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release(struct dx_frame *frames); 163static void dx_release(struct dx_frame *frames);
164static int dx_make_map(struct ext4_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count, unsigned blocksize);
169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); 169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block(struct dx_frame *frame, 170static void dx_insert_block(struct dx_frame *frame,
171 u32 hash, ext4_lblk_t block); 171 u32 hash, ext4_lblk_t block);
172static int ext4_htree_next_block(struct inode *dir, __u32 hash, 172static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 181 struct inode *inode);
182 182
183unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
184{
185 unsigned len = le16_to_cpu(dlen);
186
187 if (len == EXT4_MAX_REC_LEN || len == 0)
188 return blocksize;
189 return (len & 65532) | ((len & 3) << 16);
190}
191
192__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
193{
194 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
195 BUG();
196 if (len < 65536)
197 return cpu_to_le16(len);
198 if (len == blocksize) {
199 if (blocksize == 65536)
200 return cpu_to_le16(EXT4_MAX_REC_LEN);
201 else
202 return cpu_to_le16(0);
203 }
204 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
205}
206
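The two helpers above make rec_len work with block sizes beyond 64KiB: since directory entry lengths are always 4-byte multiples, bits 1:0 of the on-disk 16-bit field are free to carry bits 17:16 of the real length, and 0 or EXT4_MAX_REC_LEN encode "rest of the block". A worked round-trip with a 256KiB block; the whole-block special casing is elided from the encoder for brevity.

#include <stdint.h>
#include <stdio.h>

static uint16_t to_disk(unsigned len)	/* mirrors ext4_rec_len_to_disk */
{
	return (len & 65532) | ((len >> 16) & 3);
}

static unsigned from_disk(uint16_t dlen, unsigned blocksize)
{
	if (dlen == 65535 || dlen == 0)	/* EXT4_MAX_REC_LEN or 0: whole block */
		return blocksize;
	return (dlen & 65532) | ((dlen & 3) << 16);
}

int main(void)
{
	unsigned len = 131072;		/* 0x20000: needs the folded high bits */
	uint16_t d = to_disk(len);

	/* prints: on-disk=2 decoded=131072 */
	printf("on-disk=%u decoded=%u\n", d, from_disk(d, 262144));
	return 0;
}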
183/* 207/*
184 * p is at least 6 bytes before the end of page 208 * p is at least 6 bytes before the end of page
185 */ 209 */
186static inline struct ext4_dir_entry_2 * 210static inline struct ext4_dir_entry_2 *
187ext4_next_entry(struct ext4_dir_entry_2 *p) 211ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
188{ 212{
189 return (struct ext4_dir_entry_2 *)((char *)p + 213 return (struct ext4_dir_entry_2 *)((char *)p +
190 ext4_rec_len_from_disk(p->rec_len)); 214 ext4_rec_len_from_disk(p->rec_len, blocksize));
191} 215}
192 216
193/* 217/*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
294 space += EXT4_DIR_REC_LEN(de->name_len); 318 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 319 names++;
296 } 320 }
297 de = ext4_next_entry(de); 321 de = ext4_next_entry(de, size);
298 } 322 }
299 printk("(%i)\n", names); 323 printk("(%i)\n", names);
300 return (struct stats) { names, space, 1 }; 324 return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
585 top = (struct ext4_dir_entry_2 *) ((char *) de + 609 top = (struct ext4_dir_entry_2 *) ((char *) de +
586 dir->i_sb->s_blocksize - 610 dir->i_sb->s_blocksize -
587 EXT4_DIR_REC_LEN(0)); 611 EXT4_DIR_REC_LEN(0));
588 for (; de < top; de = ext4_next_entry(de)) { 612 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
589 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 613 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
590 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 614 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
591 +((char *)de - bh->b_data))) { 615 +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
663 } 687 }
664 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 688 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
665 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 689 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
666 de = ext4_next_entry(de); 690 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
667 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 691 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
668 goto errout; 692 goto errout;
669 count++; 693 count++;
@@ -713,15 +737,15 @@ errout:
713 * Create map of hash values, offsets, and sizes, stored at end of block. 737 * Create map of hash values, offsets, and sizes, stored at end of block.
714 * Returns number of entries mapped. 738 * Returns number of entries mapped.
715 */ 739 */
716static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 740static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
717 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 741 struct dx_hash_info *hinfo,
742 struct dx_map_entry *map_tail)
718{ 743{
719 int count = 0; 744 int count = 0;
720 char *base = (char *) de; 745 char *base = (char *) de;
721 struct dx_hash_info h = *hinfo; 746 struct dx_hash_info h = *hinfo;
722 747
723 while ((char *) de < base + size) 748 while ((char *) de < base + blocksize) {
724 {
725 if (de->name_len && de->inode) { 749 if (de->name_len && de->inode) {
726 ext4fs_dirhash(de->name, de->name_len, &h); 750 ext4fs_dirhash(de->name, de->name_len, &h);
727 map_tail--; 751 map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
732 cond_resched(); 756 cond_resched();
733 } 757 }
734 /* XXX: do we need to check rec_len == 0 case? -Chris */ 758 /* XXX: do we need to check rec_len == 0 case? -Chris */
735 de = ext4_next_entry(de); 759 de = ext4_next_entry(de, blocksize);
736 } 760 }
737 return count; 761 return count;
738} 762}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
832 return 1; 856 return 1;
833 } 857 }
834 /* prevent looping on a bad block */ 858 /* prevent looping on a bad block */
835 de_len = ext4_rec_len_from_disk(de->rec_len); 859 de_len = ext4_rec_len_from_disk(de->rec_len,
860 dir->i_sb->s_blocksize);
836 if (de_len <= 0) 861 if (de_len <= 0)
837 return -1; 862 return -1;
838 offset += de_len; 863 offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
996 de = (struct ext4_dir_entry_2 *) bh->b_data; 1021 de = (struct ext4_dir_entry_2 *) bh->b_data;
997 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 1022 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
998 EXT4_DIR_REC_LEN(0)); 1023 EXT4_DIR_REC_LEN(0));
999 for (; de < top; de = ext4_next_entry(de)) { 1024 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1000 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 1025 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1001 + ((char *) de - bh->b_data); 1026 + ((char *) de - bh->b_data);
1002 1027
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1052 return ERR_PTR(-EIO); 1077 return ERR_PTR(-EIO);
1053 } 1078 }
1054 inode = ext4_iget(dir->i_sb, ino); 1079 inode = ext4_iget(dir->i_sb, ino);
1055 if (IS_ERR(inode)) 1080 if (unlikely(IS_ERR(inode))) {
1056 return ERR_CAST(inode); 1081 if (PTR_ERR(inode) == -ESTALE) {
1082 ext4_error(dir->i_sb, __func__,
1083 "deleted inode referenced: %u",
1084 ino);
1085 return ERR_PTR(-EIO);
1086 } else {
1087 return ERR_CAST(inode);
1088 }
1089 }
1057 } 1090 }
1058 return d_splice_alias(inode, dentry); 1091 return d_splice_alias(inode, dentry);
1059} 1092}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
1109 * Returns pointer to last entry moved. 1142 * Returns pointer to last entry moved.
1110 */ 1143 */
1111static struct ext4_dir_entry_2 * 1144static struct ext4_dir_entry_2 *
1112dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) 1145dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1146 unsigned blocksize)
1113{ 1147{
1114 unsigned rec_len = 0; 1148 unsigned rec_len = 0;
1115 1149
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1118 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1119 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1120 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
1121 ext4_rec_len_to_disk(rec_len); 1155 ext4_rec_len_to_disk(rec_len, blocksize);
1122 de->inode = 0; 1156 de->inode = 0;
1123 map++; 1157 map++;
1124 to += rec_len; 1158 to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1130 * Compact each dir entry in the range to the minimal rec_len. 1164 * Compact each dir entry in the range to the minimal rec_len.
1131 * Returns pointer to last entry in range. 1165 * Returns pointer to last entry in range.
1132 */ 1166 */
1133static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) 1167static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1134{ 1168{
1135 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1169 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1136 unsigned rec_len = 0; 1170 unsigned rec_len = 0;
1137 1171
1138 prev = to = de; 1172 prev = to = de;
1139 while ((char*)de < base + size) { 1173 while ((char*)de < base + blocksize) {
1140 next = ext4_next_entry(de); 1174 next = ext4_next_entry(de, blocksize);
1141 if (de->inode && de->name_len) { 1175 if (de->inode && de->name_len) {
1142 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1176 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1143 if (de > to) 1177 if (de > to)
1144 memmove(to, de, rec_len); 1178 memmove(to, de, rec_len);
1145 to->rec_len = ext4_rec_len_to_disk(rec_len); 1179 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1146 prev = to; 1180 prev = to;
1147 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1181 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1148 } 1182 }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1215 hash2, split, count-split)); 1249 hash2, split, count-split));
1216 1250
1217 /* Fancy dance to stay within two buffers */ 1251 /* Fancy dance to stay within two buffers */
1218 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1252 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1219 de = dx_pack_dirents(data1, blocksize); 1253 de = dx_pack_dirents(data1, blocksize);
1220 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1254 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1221 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1255 blocksize);
1256 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
1257 blocksize);
1222 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1258 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1223 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1259 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1224 1260
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1268 const char *name = dentry->d_name.name; 1304 const char *name = dentry->d_name.name;
1269 int namelen = dentry->d_name.len; 1305 int namelen = dentry->d_name.len;
1270 unsigned int offset = 0; 1306 unsigned int offset = 0;
1307 unsigned int blocksize = dir->i_sb->s_blocksize;
1271 unsigned short reclen; 1308 unsigned short reclen;
1272 int nlen, rlen, err; 1309 int nlen, rlen, err;
1273 char *top; 1310 char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1275 reclen = EXT4_DIR_REC_LEN(namelen); 1312 reclen = EXT4_DIR_REC_LEN(namelen);
1276 if (!de) { 1313 if (!de) {
1277 de = (struct ext4_dir_entry_2 *)bh->b_data; 1314 de = (struct ext4_dir_entry_2 *)bh->b_data;
1278 top = bh->b_data + dir->i_sb->s_blocksize - reclen; 1315 top = bh->b_data + blocksize - reclen;
1279 while ((char *) de <= top) { 1316 while ((char *) de <= top) {
1280 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1281 bh, offset)) { 1318 bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1287 return -EEXIST; 1324 return -EEXIST;
1288 } 1325 }
1289 nlen = EXT4_DIR_REC_LEN(de->name_len); 1326 nlen = EXT4_DIR_REC_LEN(de->name_len);
1290 rlen = ext4_rec_len_from_disk(de->rec_len); 1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1291 if ((de->inode? rlen - nlen: rlen) >= reclen) 1328 if ((de->inode? rlen - nlen: rlen) >= reclen)
1292 break; 1329 break;
1293 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1330 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1306 1343
1307 /* By now the buffer is marked for journaling */ 1344 /* By now the buffer is marked for journaling */
1308 nlen = EXT4_DIR_REC_LEN(de->name_len); 1345 nlen = EXT4_DIR_REC_LEN(de->name_len);
1309 rlen = ext4_rec_len_from_disk(de->rec_len); 1346 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1310 if (de->inode) { 1347 if (de->inode) {
1311 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1348 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1312 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); 1349 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1313 de->rec_len = ext4_rec_len_to_disk(nlen); 1350 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1314 de = de1; 1351 de = de1;
1315 } 1352 }
1316 de->file_type = EXT4_FT_UNKNOWN; 1353 de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1380 /* The 0th block becomes the root, move the dirents out */ 1417 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot; 1418 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde + 1419 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len)); 1420 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1384 if ((char *) de >= (((char *) root) + blocksize)) { 1421 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__, 1422 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu", 1423 "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1402 memcpy (data1, de, len); 1439 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1440 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1441 top = data1 + len;
1405 while ((char *)(de2 = ext4_next_entry(de)) < top) 1442 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1406 de = de2; 1443 de = de2;
1407 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1444 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1445 blocksize);
1408 /* Initialize the root; the dot dirents already exist */ 1446 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1447 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); 1448 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1449 blocksize);
1411 memset (&root->info, 0, sizeof(root->info)); 1450 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1451 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1452 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1488 return retval; 1527 return retval;
1489 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1490 de->inode = 0; 1529 de->inode = 0;
1491 de->rec_len = ext4_rec_len_to_disk(blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1492 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1493} 1532}
1494 1533
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1551 goto cleanup; 1590 goto cleanup;
1552 node2 = (struct dx_node *)(bh2->b_data); 1591 node2 = (struct dx_node *)(bh2->b_data);
1553 entries2 = node2->entries; 1592 entries2 = node2->entries;
1554 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); 1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize);
1555 node2->fake.inode = 0; 1595 node2->fake.inode = 0;
1556 BUFFER_TRACE(frame->bh, "get_write_access"); 1596 BUFFER_TRACE(frame->bh, "get_write_access");
1557 err = ext4_journal_get_write_access(handle, frame->bh); 1597 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
1639 struct buffer_head *bh) 1679 struct buffer_head *bh)
1640{ 1680{
1641 struct ext4_dir_entry_2 *de, *pde; 1681 struct ext4_dir_entry_2 *de, *pde;
1682 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1683 int i;
1643 1684
1644 i = 0; 1685 i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 ext4_journal_get_write_access(handle, bh); 1693 ext4_journal_get_write_access(handle, bh);
1653 if (pde) 1694 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1695 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len) + 1696 ext4_rec_len_from_disk(pde->rec_len,
1656 ext4_rec_len_from_disk(de->rec_len)); 1697 blocksize) +
1698 ext4_rec_len_from_disk(de->rec_len,
1699 blocksize),
1700 blocksize);
1657 else 1701 else
1658 de->inode = 0; 1702 de->inode = 0;
1659 dir->i_version++; 1703 dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
1661 ext4_handle_dirty_metadata(handle, dir, bh); 1705 ext4_handle_dirty_metadata(handle, dir, bh);
1662 return 0; 1706 return 0;
1663 } 1707 }
1664 i += ext4_rec_len_from_disk(de->rec_len); 1708 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
1665 pde = de; 1709 pde = de;
1666 de = ext4_next_entry(de); 1710 de = ext4_next_entry(de, blocksize);
1667 } 1711 }
1668 return -ENOENT; 1712 return -ENOENT;
1669} 1713}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1793 struct inode *inode; 1837 struct inode *inode;
1794 struct buffer_head *dir_block; 1838 struct buffer_head *dir_block;
1795 struct ext4_dir_entry_2 *de; 1839 struct ext4_dir_entry_2 *de;
1840 unsigned int blocksize = dir->i_sb->s_blocksize;
1796 int err, retries = 0; 1841 int err, retries = 0;
1797 1842
1798 if (EXT4_DIR_LINK_MAX(dir)) 1843 if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1869 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1870 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1871 de->name_len = 1;
1827 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1872 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
1873 blocksize);
1828 strcpy(de->name, "."); 1874 strcpy(de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1875 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = ext4_next_entry(de); 1876 de = ext4_next_entry(de, blocksize);
1831 de->inode = cpu_to_le32(dir->i_ino); 1877 de->inode = cpu_to_le32(dir->i_ino);
1832 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1878 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
1833 EXT4_DIR_REC_LEN(1)); 1879 blocksize);
1834 de->name_len = 2; 1880 de->name_len = 2;
1835 strcpy(de->name, ".."); 1881 strcpy(de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1882 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
1885 return 1; 1931 return 1;
1886 } 1932 }
1887 de = (struct ext4_dir_entry_2 *) bh->b_data; 1933 de = (struct ext4_dir_entry_2 *) bh->b_data;
1888 de1 = ext4_next_entry(de); 1934 de1 = ext4_next_entry(de, sb->s_blocksize);
1889 if (le32_to_cpu(de->inode) != inode->i_ino || 1935 if (le32_to_cpu(de->inode) != inode->i_ino ||
1890 !le32_to_cpu(de1->inode) || 1936 !le32_to_cpu(de1->inode) ||
1891 strcmp(".", de->name) || 1937 strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
1896 brelse(bh); 1942 brelse(bh);
1897 return 1; 1943 return 1;
1898 } 1944 }
1899 offset = ext4_rec_len_from_disk(de->rec_len) + 1945 offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
1900 ext4_rec_len_from_disk(de1->rec_len); 1946 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
1901 de = ext4_next_entry(de1); 1947 de = ext4_next_entry(de1, sb->s_blocksize);
1902 while (offset < inode->i_size) { 1948 while (offset < inode->i_size) {
1903 if (!bh || 1949 if (!bh ||
1904 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1950 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
1927 brelse(bh); 1973 brelse(bh);
1928 return 0; 1974 return 0;
1929 } 1975 }
1930 offset += ext4_rec_len_from_disk(de->rec_len); 1976 offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
1931 de = ext4_next_entry(de); 1977 de = ext4_next_entry(de, sb->s_blocksize);
1932 } 1978 }
1933 brelse(bh); 1979 brelse(bh);
1934 return 1; 1980 return 1;
@@ -2297,8 +2343,8 @@ retry:
2297 return err; 2343 return err;
2298} 2344}
2299 2345
2300#define PARENT_INO(buffer) \ 2346#define PARENT_INO(buffer, size) \
2301 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) 2347 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
2302 2348
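PARENT_INO relies on the classic ext2/3/4 layout of a directory's block 0: the first entry is "." and the entry immediately after it is "..", whose inode field is the parent. A minimal userspace illustration of that stepping follows; the struct is simplified from ext4_dir_entry_2, endianness and the >64KB rec_len encoding are ignored (the real macro goes through ext4_rec_len_from_disk() for exactly that reason).

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Simplified from struct ext4_dir_entry_2 in fs/ext4/ext4.h. */
	struct dirent2 {
		uint32_t inode;
		uint16_t rec_len;	/* distance to the next entry */
		uint8_t  name_len;
		uint8_t  file_type;
		char     name[255];
	};

	/* The stepping PARENT_INO(buf, size) performs, for blocks < 64KB. */
	static uint32_t parent_ino(char *buf)
	{
		struct dirent2 *dot = (struct dirent2 *) buf;
		struct dirent2 *dotdot =
			(struct dirent2 *) (buf + dot->rec_len);
		return dotdot->inode;
	}

	int main(void)
	{
		char block[1024];
		struct dirent2 *de = (struct dirent2 *) block;

		memset(block, 0, sizeof(block));
		de->inode = 12; de->rec_len = 12;   de->name_len = 1;
		memcpy(de->name, ".", 1);
		de = (struct dirent2 *) (block + 12);
		de->inode = 2;  de->rec_len = 1012; de->name_len = 2;
		memcpy(de->name, "..", 2);

		printf("parent inode = %u\n", parent_ino(block)); /* prints 2 */
		return 0;
	}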
2303/* 2349/*
2304 * Anybody can rename anything with this: the permission checks are left to the 2350 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2311 struct inode *old_inode, *new_inode; 2357 struct inode *old_inode, *new_inode;
2312 struct buffer_head *old_bh, *new_bh, *dir_bh; 2358 struct buffer_head *old_bh, *new_bh, *dir_bh;
2313 struct ext4_dir_entry_2 *old_de, *new_de; 2359 struct ext4_dir_entry_2 *old_de, *new_de;
2314 int retval; 2360 int retval, force_da_alloc = 0;
2315 2361
2316 old_bh = new_bh = dir_bh = NULL; 2362 old_bh = new_bh = dir_bh = NULL;
2317 2363
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2404 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2359 if (!dir_bh) 2405 if (!dir_bh)
2360 goto end_rename; 2406 goto end_rename;
2361 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2407 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2408 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2362 goto end_rename; 2409 goto end_rename;
2363 retval = -EMLINK; 2410 retval = -EMLINK;
2364 if (!new_inode && new_dir != old_dir && 2411 if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2430 if (dir_bh) { 2477 if (dir_bh) {
2431 BUFFER_TRACE(dir_bh, "get_write_access"); 2478 BUFFER_TRACE(dir_bh, "get_write_access");
2432 ext4_journal_get_write_access(handle, dir_bh); 2479 ext4_journal_get_write_access(handle, dir_bh);
2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2480 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2481 cpu_to_le32(new_dir->i_ino);
2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2482 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2483 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2436 ext4_dec_count(handle, old_dir); 2484 ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2449 ext4_mark_inode_dirty(handle, new_inode); 2497 ext4_mark_inode_dirty(handle, new_inode);
2450 if (!new_inode->i_nlink) 2498 if (!new_inode->i_nlink)
2451 ext4_orphan_add(handle, new_inode); 2499 ext4_orphan_add(handle, new_inode);
2500 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2501 force_da_alloc = 1;
2452 } 2502 }
2453 retval = 0; 2503 retval = 0;
2454 2504
@@ -2457,6 +2507,8 @@ end_rename:
2457 brelse(old_bh); 2507 brelse(old_bh);
2458 brelse(new_bh); 2508 brelse(new_bh);
2459 ext4_journal_stop(handle); 2509 ext4_journal_stop(handle);
2510 if (retval == 0 && force_da_alloc)
2511 ext4_alloc_da_blocks(old_inode);
2460 return retval; 2512 return retval;
2461} 2513}

2462 2514
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
939 ext4_group_t flex_group; 939 ext4_group_t flex_group;
940 flex_group = ext4_flex_group(sbi, input->group); 940 flex_group = ext4_flex_group(sbi, input->group);
941 sbi->s_flex_groups[flex_group].free_blocks += 941 atomic_add(input->free_blocks_count,
942 input->free_blocks_count; 942 &sbi->s_flex_groups[flex_group].free_blocks);
943 sbi->s_flex_groups[flex_group].free_inodes += 943 atomic_add(EXT4_INODES_PER_GROUP(sb),
944 EXT4_INODES_PER_GROUP(sb); 944 &sbi->s_flex_groups[flex_group].free_inodes);
945 } 945 }
946 946
947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923d..9987bba99db3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/ctype.h>
38#include <linux/marker.h> 39#include <linux/marker.h>
39#include <linux/log2.h> 40#include <linux/log2.h>
40#include <linux/crc16.h> 41#include <linux/crc16.h>
@@ -48,6 +49,7 @@
48#include "group.h" 49#include "group.h"
49 50
50struct proc_dir_entry *ext4_proc_root; 51struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset;
51 53
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 55 unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
577 ext4_commit_super(sb, es, 1); 579 ext4_commit_super(sb, es, 1);
578 } 580 }
579 if (sbi->s_proc) { 581 if (sbi->s_proc) {
580 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
581 remove_proc_entry(sb->s_id, ext4_proc_root); 582 remove_proc_entry(sb->s_id, ext4_proc_root);
582 } 583 }
584 kobject_del(&sbi->s_kobj);
583 585
584 for (i = 0; i < sbi->s_gdb_count; i++) 586 for (i = 0; i < sbi->s_gdb_count; i++)
585 brelse(sbi->s_group_desc[i]); 587 brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
615 ext4_blkdev_remove(sbi); 617 ext4_blkdev_remove(sbi);
616 } 618 }
617 sb->s_fs_info = NULL; 619 sb->s_fs_info = NULL;
620 /*
621 * Now that we are completely done shutting down the
622 * superblock, we need to actually destroy the kobject.
623 */
624 unlock_kernel();
625 unlock_super(sb);
626 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock);
618 kfree(sbi); 631 kfree(sbi);
619 return; 632 return;
620} 633}
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
803 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 816 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
804 seq_puts(seq, ",noacl"); 817 seq_puts(seq, ",noacl");
805#endif 818#endif
806 if (!test_opt(sb, RESERVATION))
807 seq_puts(seq, ",noreservation");
808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 819 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
809 seq_printf(seq, ",commit=%u", 820 seq_printf(seq, ",commit=%u",
810 (unsigned) (sbi->s_commit_interval / HZ)); 821 (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
855 if (test_opt(sb, DATA_ERR_ABORT)) 866 if (test_opt(sb, DATA_ERR_ABORT))
856 seq_puts(seq, ",data_err=abort"); 867 seq_puts(seq, ",data_err=abort");
857 868
869 if (test_opt(sb, NO_AUTO_DA_ALLOC))
870 seq_puts(seq, ",noauto_da_alloc");
871
858 ext4_show_quota_options(seq, sb); 872 ext4_show_quota_options(seq, sb);
859 return 0; 873 return 0;
860} 874}
@@ -1004,7 +1018,7 @@ enum {
1004 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1018 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1005 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1019 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
1006 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1020 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1007 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1021 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
1008 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1022 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1009 Opt_journal_update, Opt_journal_dev, 1023 Opt_journal_update, Opt_journal_dev,
1010 Opt_journal_checksum, Opt_journal_async_commit, 1024 Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
1012 Opt_data_err_abort, Opt_data_err_ignore, 1026 Opt_data_err_abort, Opt_data_err_ignore,
1013 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1014 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1015 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1016 Opt_grpquota, Opt_i_version, 1030 Opt_usrquota, Opt_grpquota, Opt_i_version,
1017 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1018 Opt_inode_readahead_blks, Opt_journal_ioprio 1032 Opt_inode_readahead_blks, Opt_journal_ioprio
1019}; 1033};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
1039 {Opt_nouser_xattr, "nouser_xattr"}, 1053 {Opt_nouser_xattr, "nouser_xattr"},
1040 {Opt_acl, "acl"}, 1054 {Opt_acl, "acl"},
1041 {Opt_noacl, "noacl"}, 1055 {Opt_noacl, "noacl"},
1042 {Opt_reservation, "reservation"},
1043 {Opt_noreservation, "noreservation"},
1044 {Opt_noload, "noload"}, 1056 {Opt_noload, "noload"},
1045 {Opt_nobh, "nobh"}, 1057 {Opt_nobh, "nobh"},
1046 {Opt_bh, "bh"}, 1058 {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
1068 {Opt_quota, "quota"}, 1080 {Opt_quota, "quota"},
1069 {Opt_usrquota, "usrquota"}, 1081 {Opt_usrquota, "usrquota"},
1070 {Opt_barrier, "barrier=%u"}, 1082 {Opt_barrier, "barrier=%u"},
1083 {Opt_barrier, "barrier"},
1084 {Opt_nobarrier, "nobarrier"},
1071 {Opt_i_version, "i_version"}, 1085 {Opt_i_version, "i_version"},
1072 {Opt_stripe, "stripe=%u"}, 1086 {Opt_stripe, "stripe=%u"},
1073 {Opt_resize, "resize"}, 1087 {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
1075 {Opt_nodelalloc, "nodelalloc"}, 1089 {Opt_nodelalloc, "nodelalloc"},
1076 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1077 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1091 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1093 {Opt_auto_da_alloc, "auto_da_alloc"},
1094 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1078 {Opt_err, NULL}, 1095 {Opt_err, NULL},
1079}; 1096};
1080 1097
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1207 "not supported\n"); 1224 "not supported\n");
1208 break; 1225 break;
1209#endif 1226#endif
1210 case Opt_reservation:
1211 set_opt(sbi->s_mount_opt, RESERVATION);
1212 break;
1213 case Opt_noreservation:
1214 clear_opt(sbi->s_mount_opt, RESERVATION);
1215 break;
1216 case Opt_journal_update: 1227 case Opt_journal_update:
1217 /* @@@ FIXME */ 1228 /* @@@ FIXME */
1218 /* Eventually we will want to be able to create 1229 /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
1415 case Opt_abort: 1426 case Opt_abort:
1416 set_opt(sbi->s_mount_opt, ABORT); 1427 set_opt(sbi->s_mount_opt, ABORT);
1417 break; 1428 break;
1429 case Opt_nobarrier:
1430 clear_opt(sbi->s_mount_opt, BARRIER);
1431 break;
1418 case Opt_barrier: 1432 case Opt_barrier:
1419 if (match_int(&args[0], &option)) 1433 if (match_int(&args[0], &option)) {
1420 return 0; 1434 set_opt(sbi->s_mount_opt, BARRIER);
1435 break;
1436 }
1421 if (option) 1437 if (option)
1422 set_opt(sbi->s_mount_opt, BARRIER); 1438 set_opt(sbi->s_mount_opt, BARRIER);
1423 else 1439 else
@@ -1463,6 +1479,11 @@ set_qf_format:
1463 return 0; 1479 return 0;
1464 if (option < 0 || option > (1 << 30)) 1480 if (option < 0 || option > (1 << 30))
1465 return 0; 1481 return 0;
1482 if (option & (option - 1)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
1484 " must be a power of 2\n");
1485 return 0;
1486 }
1466 sbi->s_inode_readahead_blks = option; 1487 sbi->s_inode_readahead_blks = option;
1467 break; 1488 break;
1468 case Opt_journal_ioprio: 1489 case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
1473 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 1494 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1474 option); 1495 option);
1475 break; 1496 break;
1498 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1499 break;
1500 case Opt_auto_da_alloc:
1501 if (match_int(&args[0], &option)) {
1502 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1503 break;
1504 }
1505 if (option)
1506 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1507 else
 1508 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1509 break;
1476 default: 1510 default:
1477 printk(KERN_ERR 1511 printk(KERN_ERR
1478 "EXT4-fs: Unrecognized mount option \"%s\" " 1512 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1612 gdp = ext4_get_group_desc(sb, i, &bh); 1646 gdp = ext4_get_group_desc(sb, i, &bh);
1613 1647
1614 flex_group = ext4_flex_group(sbi, i); 1648 flex_group = ext4_flex_group(sbi, i);
1615 sbi->s_flex_groups[flex_group].free_inodes += 1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
1616 ext4_free_inodes_count(sb, gdp); 1650 ext4_free_inodes_count(sb, gdp));
1617 sbi->s_flex_groups[flex_group].free_blocks += 1651 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
1618 ext4_free_blks_count(sb, gdp); 1652 ext4_free_blks_count(sb, gdp));
1653 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
1654 ext4_used_dirs_count(sb, gdp));
1619 } 1655 }
1620 1656
1621 return 1; 1657 return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1991 return 0; 2027 return 0;
1992} 2028}
1993 2029
 2030/* sysfs support */
2031
2032struct ext4_attr {
2033 struct attribute attr;
2034 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2035 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2036 const char *, size_t);
2037 int offset;
2038};
2039
2040static int parse_strtoul(const char *buf,
2041 unsigned long max, unsigned long *value)
2042{
2043 char *endp;
2044
2045 while (*buf && isspace(*buf))
2046 buf++;
2047 *value = simple_strtoul(buf, &endp, 0);
2048 while (*endp && isspace(*endp))
2049 endp++;
2050 if (*endp || *value > max)
2051 return -EINVAL;
2052
2053 return 0;
2054}
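parse_strtoul() is the input helper for the sysfs store methods below: it skips leading whitespace, parses an unsigned long in any base simple_strtoul() accepts, tolerates trailing whitespace (so `echo 32 > file` works with its newline), and rejects anything else or any value above max. A userspace sketch with the same contract — strtoul standing in for the kernel's simple_strtoul():

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Same contract as parse_strtoul() above; returns 0 on success. */
	static int parse_strtoul(const char *buf, unsigned long max,
				 unsigned long *value)
	{
		char *endp;

		while (*buf && isspace((unsigned char) *buf))
			buf++;
		*value = strtoul(buf, &endp, 0);
		while (*endp && isspace((unsigned char) *endp))
			endp++;
		if (*endp || *value > max)
			return -1;
		return 0;
	}

	int main(void)
	{
		unsigned long v;

		printf("%d\n", parse_strtoul("  32\n", 0x40000000, &v)); /* 0, v=32 */
		printf("%d\n", parse_strtoul("0x20", 0x40000000, &v));   /* 0, v=32 */
		printf("%d\n", parse_strtoul("32k", 0x40000000, &v));    /* -1 */
		return 0;
	}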
2055
2056static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2057 struct ext4_sb_info *sbi,
2058 char *buf)
2059{
2060 return snprintf(buf, PAGE_SIZE, "%llu\n",
2061 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2062}
2063
2064static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2065 struct ext4_sb_info *sbi, char *buf)
2066{
2067 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2068
2069 return snprintf(buf, PAGE_SIZE, "%lu\n",
2070 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2071 sbi->s_sectors_written_start) >> 1);
2072}
2073
2074static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2075 struct ext4_sb_info *sbi, char *buf)
2076{
2077 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2078
2079 return snprintf(buf, PAGE_SIZE, "%llu\n",
2080 sbi->s_kbytes_written +
2081 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2082 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2083}
2084
2085static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2086 struct ext4_sb_info *sbi,
2087 const char *buf, size_t count)
2088{
2089 unsigned long t;
2090
2091 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL;
2093
2094 /* inode_readahead_blks must be a power of 2 */
2095 if (t & (t-1))
2096 return -EINVAL;
2097
2098 sbi->s_inode_readahead_blks = t;
2099 return count;
2100}
2101
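The t & (t - 1) test here (and the matching option & (option - 1) in the mount-option parser above) is the standard power-of-two check: a power of two has exactly one bit set, so clearing its lowest set bit yields zero. Note that zero also passes, which disables readahead. A quick illustration:

	#include <stdio.h>

	/* Nonzero iff x has at most one bit set (0 is accepted, as above). */
	static int pow2_or_zero(unsigned long x)
	{
		return (x & (x - 1)) == 0;
	}

	int main(void)
	{
		unsigned long tests[] = { 0, 1, 2, 3, 32, 33, 0x40000000 };

		for (int i = 0; i < 7; i++)
			printf("%#lx -> %d\n", tests[i], pow2_or_zero(tests[i]));
		/* 0, 1, 2, 32, 0x40000000 pass; 3 and 33 fail. */
		return 0;
	}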
2102static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf)
2104{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106
2107 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2108}
2109
2110static ssize_t sbi_ui_store(struct ext4_attr *a,
2111 struct ext4_sb_info *sbi,
2112 const char *buf, size_t count)
2113{
2114 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2115 unsigned long t;
2116
2117 if (parse_strtoul(buf, 0xffffffff, &t))
2118 return -EINVAL;
2119 *ui = t;
2120 return count;
2121}
2122
2123#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2124static struct ext4_attr ext4_attr_##_name = { \
2125 .attr = {.name = __stringify(_name), .mode = _mode }, \
2126 .show = _show, \
2127 .store = _store, \
2128 .offset = offsetof(struct ext4_sb_info, _elname), \
2129}
2130#define EXT4_ATTR(name, mode, show, store) \
2131static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2132
2133#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2134#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2135#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2136 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2137#define ATTR_LIST(name) &ext4_attr_##name.attr
2138
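These macros let one generic show/store pair (sbi_ui_show/sbi_ui_store) serve every unsigned-int tunable, selecting the field by its offsetof() inside struct ext4_sb_info. A toy standalone model of the same pattern, with made-up struct and attribute names for illustration:

	#include <stddef.h>
	#include <stdio.h>

	struct sbi { unsigned int mb_stats; unsigned int mb_max_to_scan; };

	struct attr {
		const char *name;
		int offset;
	};

	/* One generic reader for all unsigned-int fields, as sbi_ui_show does. */
	static unsigned int attr_show(struct attr *a, struct sbi *sbi)
	{
		return *(unsigned int *) (((char *) sbi) + a->offset);
	}

	#define RW_ATTR_UI(_name) \
		{ .name = #_name, .offset = offsetof(struct sbi, _name) }

	int main(void)
	{
		struct sbi s = { .mb_stats = 1, .mb_max_to_scan = 200 };
		struct attr attrs[] = {
			RW_ATTR_UI(mb_stats),
			RW_ATTR_UI(mb_max_to_scan),
		};

		for (int i = 0; i < 2; i++)
			printf("%s = %u\n", attrs[i].name, attr_show(&attrs[i], &s));
		return 0;
	}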
2139EXT4_RO_ATTR(delayed_allocation_blocks);
2140EXT4_RO_ATTR(session_write_kbytes);
2141EXT4_RO_ATTR(lifetime_write_kbytes);
2142EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2143 inode_readahead_blks_store, s_inode_readahead_blks);
2144EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2145EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2146EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2147EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2148EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2149EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2150
2151static struct attribute *ext4_attrs[] = {
2152 ATTR_LIST(delayed_allocation_blocks),
2153 ATTR_LIST(session_write_kbytes),
2154 ATTR_LIST(lifetime_write_kbytes),
2155 ATTR_LIST(inode_readahead_blks),
2156 ATTR_LIST(mb_stats),
2157 ATTR_LIST(mb_max_to_scan),
2158 ATTR_LIST(mb_min_to_scan),
2159 ATTR_LIST(mb_order2_req),
2160 ATTR_LIST(mb_stream_req),
2161 ATTR_LIST(mb_group_prealloc),
2162 NULL,
2163};
2164
2165static ssize_t ext4_attr_show(struct kobject *kobj,
2166 struct attribute *attr, char *buf)
2167{
2168 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2169 s_kobj);
2170 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2171
2172 return a->show ? a->show(a, sbi, buf) : 0;
2173}
2174
2175static ssize_t ext4_attr_store(struct kobject *kobj,
2176 struct attribute *attr,
2177 const char *buf, size_t len)
2178{
2179 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2180 s_kobj);
2181 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2182
2183 return a->store ? a->store(a, sbi, buf, len) : 0;
2184}
2185
2186static void ext4_sb_release(struct kobject *kobj)
2187{
2188 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2189 s_kobj);
2190 complete(&sbi->s_kobj_unregister);
2191}
2192
2193
2194static struct sysfs_ops ext4_attr_ops = {
2195 .show = ext4_attr_show,
2196 .store = ext4_attr_store,
2197};
2198
2199static struct kobj_type ext4_ktype = {
2200 .default_attrs = ext4_attrs,
2201 .sysfs_ops = &ext4_attr_ops,
2202 .release = ext4_sb_release,
2203};
2204
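The s_kobj_unregister completion closes a teardown race: sysfs files may still be open when ext4_put_super() runs, so the kobject's refcount, not the superblock, decides when ext4_sb_release() fires, and put_super must wait for it before kfree(sbi). A stripped-down pthreads model of the handshake — the real code uses kobject refcounts rather than an explicit thread:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
	static int released;

	/* Stands in for ext4_sb_release(): runs when the last reference drops. */
	static void *last_ref_drop(void *arg)
	{
		pthread_mutex_lock(&lock);
		released = 1;			/* complete(&sbi->s_kobj_unregister) */
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, last_ref_drop, NULL);

		/* ext4_put_super(): kobject_put(), then wait before kfree(sbi). */
		pthread_mutex_lock(&lock);
		while (!released)		/* wait_for_completion() */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);

		pthread_join(t, NULL);
		printf("safe to kfree(sbi)\n");
		return 0;
	}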
1994static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2205static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1995 __releases(kernel_lock) 2206 __releases(kernel_lock)
1996 __acquires(kernel_lock) 2207 __acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2232 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2022 if (!sbi) 2233 if (!sbi)
2023 return -ENOMEM; 2234 return -ENOMEM;
2235
2236 sbi->s_blockgroup_lock =
2237 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2238 if (!sbi->s_blockgroup_lock) {
2239 kfree(sbi);
2240 return -ENOMEM;
2241 }
2024 sb->s_fs_info = sbi; 2242 sb->s_fs_info = sbi;
2025 sbi->s_mount_opt = 0; 2243 sbi->s_mount_opt = 0;
2026 sbi->s_resuid = EXT4_DEF_RESUID; 2244 sbi->s_resuid = EXT4_DEF_RESUID;
2027 sbi->s_resgid = EXT4_DEF_RESGID; 2245 sbi->s_resgid = EXT4_DEF_RESGID;
2028 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2246 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2029 sbi->s_sb_block = sb_block; 2247 sbi->s_sb_block = sb_block;
2248 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
2249 sectors[1]);
2030 2250
2031 unlock_kernel(); 2251 unlock_kernel();
2032 2252
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2064 sb->s_magic = le16_to_cpu(es->s_magic); 2284 sb->s_magic = le16_to_cpu(es->s_magic);
2065 if (sb->s_magic != EXT4_SUPER_MAGIC) 2285 if (sb->s_magic != EXT4_SUPER_MAGIC)
2066 goto cantfind_ext4; 2286 goto cantfind_ext4;
2287 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
2067 2288
2068 /* Set defaults before we parse the mount options */ 2289 /* Set defaults before we parse the mount options */
2069 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2290 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2101 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2102 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2103 2324
2104 set_opt(sbi->s_mount_opt, RESERVATION);
2105 set_opt(sbi->s_mount_opt, BARRIER); 2325 set_opt(sbi->s_mount_opt, BARRIER);
2106 2326
2107 /* 2327 /*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2325#ifdef CONFIG_PROC_FS 2545#ifdef CONFIG_PROC_FS
2326 if (ext4_proc_root) 2546 if (ext4_proc_root)
2327 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2547 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2328
2329 if (sbi->s_proc)
2330 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2331 &ext4_ui_proc_fops,
2332 &sbi->s_inode_readahead_blks);
2333#endif 2548#endif
2334 2549
2335 bgl_lock_init(&sbi->s_blockgroup_lock); 2550 bgl_lock_init(sbi->s_blockgroup_lock);
2336 2551
2337 for (i = 0; i < db_count; i++) { 2552 for (i = 0; i < db_count; i++) {
2338 block = descriptor_loc(sb, logical_sb_block, i); 2553 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
2564 goto failed_mount4; 2779 goto failed_mount4;
2565 } 2780 }
2566 2781
2782 sbi->s_kobj.kset = ext4_kset;
2783 init_completion(&sbi->s_kobj_unregister);
2784 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
2785 "%s", sb->s_id);
2786 if (err) {
2787 ext4_mb_release(sb);
2788 ext4_ext_release(sb);
2789 goto failed_mount4;
 2790 }
2791
2567 /* 2792 /*
2568 * akpm: core read_super() calls in here with the superblock locked. 2793 * akpm: core read_super() calls in here with the superblock locked.
2569 * That deadlocks, because orphan cleanup needs to lock the superblock 2794 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
2618 kfree(sbi->s_group_desc); 2843 kfree(sbi->s_group_desc);
2619failed_mount: 2844failed_mount:
2620 if (sbi->s_proc) { 2845 if (sbi->s_proc) {
2621 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2622 remove_proc_entry(sb->s_id, ext4_proc_root); 2846 remove_proc_entry(sb->s_id, ext4_proc_root);
2623 } 2847 }
2624#ifdef CONFIG_QUOTA 2848#ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
2913 set_buffer_uptodate(sbh); 3137 set_buffer_uptodate(sbh);
2914 } 3138 }
2915 es->s_wtime = cpu_to_le32(get_seconds()); 3139 es->s_wtime = cpu_to_le32(get_seconds());
3140 es->s_kbytes_written =
3141 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3142 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3143 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2916 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3144 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeblocks_counter)); 3145 &EXT4_SB(sb)->s_freeblocks_counter));
2918 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3146 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3647 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3875 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3648} 3876}
3649 3877
3650#ifdef CONFIG_PROC_FS
3651static int ext4_ui_proc_show(struct seq_file *m, void *v)
3652{
3653 unsigned int *p = m->private;
3654
3655 seq_printf(m, "%u\n", *p);
3656 return 0;
3657}
3658
3659static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3660{
3661 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3662}
3663
3664static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3665 size_t cnt, loff_t *ppos)
3666{
3667 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3668 char str[32];
3669
3670 if (cnt >= sizeof(str))
3671 return -EINVAL;
3672 if (copy_from_user(str, buf, cnt))
3673 return -EFAULT;
3674
3675 *p = simple_strtoul(str, NULL, 0);
3676 return cnt;
3677}
3678
3679const struct file_operations ext4_ui_proc_fops = {
3680 .owner = THIS_MODULE,
3681 .open = ext4_ui_proc_open,
3682 .read = seq_read,
3683 .llseek = seq_lseek,
3684 .release = single_release,
3685 .write = ext4_ui_proc_write,
3686};
3687#endif
3688
3689static struct file_system_type ext4_fs_type = { 3878static struct file_system_type ext4_fs_type = {
3690 .owner = THIS_MODULE, 3879 .owner = THIS_MODULE,
3691 .name = "ext4", 3880 .name = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
3719{ 3908{
3720 int err; 3909 int err;
3721 3910
3911 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3912 if (!ext4_kset)
3913 return -ENOMEM;
3722 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3914 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3723 err = init_ext4_mballoc(); 3915 err = init_ext4_mballoc();
3724 if (err) 3916 if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
3760 exit_ext4_xattr(); 3952 exit_ext4_xattr();
3761 exit_ext4_mballoc(); 3953 exit_ext4_mballoc();
3762 remove_proc_entry("fs/ext4", NULL); 3954 remove_proc_entry("fs/ext4", NULL);
3955 kset_unregister(ext4_kset);
3763} 3956}
3764 3957
3765MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3958MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..4ea72377c7a2 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
367 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
368 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
369 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
370 int write_op = WRITE;
370 371
371 /* 372 /*
372 * First job: lock down the current transaction and wait for 373 * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
401 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
402 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
403 404
405 if (commit_transaction->t_synchronous_commit)
406 write_op = WRITE_SYNC;
404 stats.u.run.rs_wait = commit_transaction->t_max_wait; 407 stats.u.run.rs_wait = commit_transaction->t_max_wait;
405 stats.u.run.rs_locked = jiffies; 408 stats.u.run.rs_locked = jiffies;
406 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 409 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
680 clear_buffer_dirty(bh); 683 clear_buffer_dirty(bh);
681 set_buffer_uptodate(bh); 684 set_buffer_uptodate(bh);
682 bh->b_end_io = journal_end_buffer_io_sync; 685 bh->b_end_io = journal_end_buffer_io_sync;
683 submit_bh(WRITE, bh); 686 submit_bh(write_op, bh);
684 } 687 }
685 cond_resched(); 688 cond_resched();
686 stats.u.run.rs_blocks_logged += bufs; 689 stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..bbe6d592d8b3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
 74 * Finally, the replay code also uses the hash tables, but at this moment no one
 75 * else can touch them (the filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
401 * the second time we would still have a pending revoke to cancel. So, 420 * the second time we would still have a pending revoke to cancel. So,
402 * do not trust the Revoked bit on buffers unless RevokeValid is also 421 * do not trust the Revoked bit on buffers unless RevokeValid is also
403 * set. 422 * set.
404 *
405 * The caller must have the journal locked.
406 */ 423 */
407int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 424int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
408{ 425{
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
480/* 497/*
481 * Write revoke records to the journal for all entries in the current 498 * Write revoke records to the journal for all entries in the current
482 * revoke hash, deleting the entries as we go. 499 * revoke hash, deleting the entries as we go.
483 *
484 * Called with the journal lock held.
485 */ 500 */
486
487void jbd2_journal_write_revoke_records(journal_t *journal, 501void jbd2_journal_write_revoke_records(journal_t *journal,
488 transaction_t *transaction) 502 transaction_t *transaction)
489{ 503{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598e..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 if (handle->h_sync)
1319 transaction->t_synchronous_commit = 1;
1318 current->journal_info = NULL; 1320 current->journal_info = NULL;
1319 spin_lock(&journal->j_state_lock); 1321 spin_lock(&journal->j_state_lock);
1320 spin_lock(&transaction->t_handle_lock); 1322 spin_lock(&transaction->t_handle_lock);
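Tying the two jbd2 hunks together: any handle stopped with h_sync set (as fsync does) marks its transaction t_synchronous_commit, and the commit path then submits the log blocks with WRITE_SYNC instead of plain WRITE so the block layer treats them as latency-sensitive. In miniature — a toy model, not the kernel structures:

	#include <stdio.h>

	#define WRITE      0
	#define WRITE_SYNC 1

	struct transaction { int t_synchronous_commit; };
	struct handle      { int h_sync; struct transaction *h_transaction; };

	/* Models the jbd2_journal_stop() hunk above. */
	static void journal_stop(struct handle *h)
	{
		if (h->h_sync)
			h->h_transaction->t_synchronous_commit = 1;
	}

	/* Models the commit-time choice passed to submit_bh(write_op, bh). */
	static int commit_write_op(struct transaction *t)
	{
		return t->t_synchronous_commit ? WRITE_SYNC : WRITE;
	}

	int main(void)
	{
		struct transaction t = { 0 };
		struct handle h = { .h_sync = 1, .h_transaction = &t };

		journal_stop(&h);
		printf("write_op = %s\n",
		       commit_write_op(&t) == WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
		return 0;
	}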
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c1..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
161 * to come from AF_INET6 remotes. The address of AF_INET remotes are
162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
191/* 142/*
192 * The server lockd has called us back to tell us the lock was granted 143 * The server lockd has called us back to tell us the lock was granted
193 */ 144 */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
215 */ 166 */
216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
217 continue; 168 continue;
218 if (!nlmclnt_cmp_addr(block->b_host, addr)) 169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
219 continue; 170 continue;
220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
221 continue; 172 continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18 18
19#include <asm/unaligned.h>
20
19#define NLMDBG_FACILITY NLMDBG_MONITOR 21#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024 22#define NSM_PROGRAM 100024
21#define NSM_VERSION 1 23#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
274{ 276{
275 u64 *p = (u64 *)&nsm->sm_priv.data; 277 u64 *p = (u64 *)&nsm->sm_priv.data;
276 struct timespec ts; 278 struct timespec ts;
279 s64 ns;
277 280
278 ktime_get_ts(&ts); 281 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts); 282 ns = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm; 283 put_unaligned(ns, p);
284 put_unaligned((unsigned long)nsm, p + 1);
281} 285}
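The switch to put_unaligned() here matters because sm_priv.data is a raw 16-byte cookie with no alignment guarantee; dereferencing it through a u64 * faults on strict-alignment architectures such as ia64 or sparc64. A portable userspace equivalent uses memcpy, which compilers lower to a single store where alignment allows — the packed struct below is just a way to manufacture a genuinely misaligned buffer for the demo:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Userspace stand-in for the kernel's put_unaligned(). */
	static void put_unaligned_u64(uint64_t val, void *p)
	{
		memcpy(p, &val, sizeof(val));	/* safe at any alignment */
	}

	int main(void)
	{
		/* data lands at offset 1: misaligned for a u64. */
		struct { char pad; char data[16]; } __attribute__((packed)) priv;
		uint64_t back;

		put_unaligned_u64(0x1122334455667788ULL, priv.data);
		put_unaligned_u64((uintptr_t) &priv, priv.data + 8);

		memcpy(&back, priv.data, sizeof(back));
		printf("cookie word 0 = %#llx\n", (unsigned long long) back);
		return 0;
	}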
282 286
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, 287static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..abf83881f68a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
53unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
54 54
55/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
67 * These can be set at insmod time (useful for NFS as root filesystem), 56 * These can be set at insmod time (useful for NFS as root filesystem),
68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 57 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
69 */ 58 */
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
204 return 0; 193 return 0;
205} 194}
206 195
207static int create_lockd_listener(struct svc_serv *serv, char *name, 196static int create_lockd_listener(struct svc_serv *serv, const char *name,
208 unsigned short port) 197 const int family, const unsigned short port)
209{ 198{
210 struct svc_xprt *xprt; 199 struct svc_xprt *xprt;
211 200
212 xprt = svc_find_xprt(serv, name, 0, 0); 201 xprt = svc_find_xprt(serv, name, family, 0);
213 if (xprt == NULL) 202 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); 203 return svc_create_xprt(serv, name, family, port,
215 204 SVC_SOCK_DEFAULTS);
216 svc_xprt_put(xprt); 205 svc_xprt_put(xprt);
217 return 0; 206 return 0;
218} 207}
219 208
209static int create_lockd_family(struct svc_serv *serv, const int family)
210{
211 int err;
212
213 err = create_lockd_listener(serv, "udp", family, nlm_udpport);
214 if (err < 0)
215 return err;
216
217 return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
218}
219
220/* 220/*
221 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
222 * 222 *
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
232 static int warned; 232 static int warned;
233 int err; 233 int err;
234 234
235 err = create_lockd_listener(serv, "udp", nlm_udpport); 235 err = create_lockd_family(serv, PF_INET);
236 if (err < 0) 236 if (err < 0)
237 goto out_err; 237 goto out_err;
238 238
239 err = create_lockd_listener(serv, "tcp", nlm_tcpport); 239#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240 if (err < 0) 240 err = create_lockd_family(serv, PF_INET6);
241 if (err < 0 && err != -EAFNOSUPPORT)
241 goto out_err; 242 goto out_err;
243#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
242 244
243 warned = 0; 245 warned = 0;
244 return 0; 246 return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
274 "lockd_up: no pid, %d users??\n", nlmsvc_users); 276 "lockd_up: no pid, %d users??\n", nlmsvc_users);
275 277
276 error = -ENOMEM; 278 error = -ENOMEM;
277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); 279 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
278 if (!serv) { 280 if (!serv) {
279 printk(KERN_WARNING "lockd_up: create service failed\n"); 281 printk(KERN_WARNING "lockd_up: create service failed\n");
280 goto out; 282 goto out;
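
The lockd change drops the compile-time choice of a single listener family: make_socks() now always creates PF_INET listeners and, when IPv6 is configured, additionally tries PF_INET6, treating -EAFNOSUPPORT as non-fatal. A hedged userspace sketch of that probe-and-tolerate pattern with plain sockets (no SUNRPC involved; function names are invented for the example):

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* Try to open a socket for one address family; mirrors the spirit of
 * create_lockd_family(): a missing family is tolerated, other errors not. */
static int try_family(int family)
{
	int fd = socket(family, SOCK_STREAM, 0);

	if (fd < 0)
		return -errno;
	close(fd);
	return 0;
}

int main(void)
{
	int err = try_family(AF_INET);

	if (err < 0)
		return 1;		/* IPv4 must always work */
	err = try_family(AF_INET6);
	if (err < 0 && err != -EAFNOSUPPORT)
		return 1;		/* IPv6 may be absent; other errors are fatal */
	puts("listeners ok");
	return 0;
}
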
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
 
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t nfs_callback_family = AF_INET;
-#endif
-
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
 	mutex_lock(&nfs_callback_mutex);
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
-				nfs_callback_family, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
 
-	ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
-				SVC_SOCK_ANONYMOUS);
+	ret = svc_create_xprt(serv, "tcp", PF_INET,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
-			nfs_callback_tcpport, nfs_callback_family);
+			nfs_callback_tcpport, PF_INET);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	ret = svc_create_xprt(serv, "tcp", PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u)\n",
+				nfs_callback_tcpport6, PF_INET6);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
 
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e7..aba38017bdef 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
-	switch (sa->sa_family) {
-		default:
-			return NULL;
-		case AF_INET6:
-			return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-			break;
-		case AF_INET:
-			ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-					addr_mapped);
-			return addr_mapped;
-	}
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-		const struct sockaddr *sa2)
-{
-	const struct in6_addr *addr1;
-	const struct in6_addr *addr2;
-	struct in6_addr addr1_mapped;
-	struct in6_addr addr2_mapped;
-
-	addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-	if (likely(addr1 != NULL)) {
-		addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-		if (likely(addr2 != NULL))
-			return ipv6_addr_equal(addr1, addr2);
-	}
-	return 0;
-}
-
 /*
  * Test if two ip6 socket addresses refer to the same socket by
  * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET6.
  */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-		const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+		const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
 
-	if (!ipv6_addr_equal(&saddr1->sin6_addr,
-			&saddr1->sin6_addr))
+	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    sin1->sin6_scope_id != sin2->sin6_scope_id)
 		return 0;
-	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    saddr1->sin6_scope_id != saddr2->sin6_scope_id)
-		return 0;
-	return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-		const struct sockaddr_in *sa2)
-{
-	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
 
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-		const struct sockaddr *sa2)
-{
-	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
-		return 0;
-	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-			(const struct sockaddr_in *)sa2);
+	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
 }
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
-		const struct sockaddr * sa2)
+#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+		const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+		const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+		const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+		(sin1->sin6_port == sin2->sin6_port);
+}
+
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 		const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
 
-	if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+		(sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+		const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
 		return 0;
-	return saddr1->sin_port == saddr2->sin_port;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+	}
+	return 0;
 }
 
 /*
  * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
  */
 static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
 		const struct sockaddr *sa2)
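
The rewritten client.c helpers split the comparison into per-family address matchers plus thin port-checking wrappers, dispatched on sa_family. A compilable userspace approximation of that dispatch structure (scope-id handling and ports omitted for brevity; names are the example's own, not the kernel's):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int match_ipaddr4(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;

	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
}

static int match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;

	return memcmp(&sin1->sin6_addr, &sin2->sin6_addr,
		      sizeof(sin1->sin6_addr)) == 0;
}

/* Address-only comparison: families must agree, then dispatch per family. */
static int match_ipaddr(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
	if (sa1->sa_family != sa2->sa_family)
		return 0;
	switch (sa1->sa_family) {
	case AF_INET:
		return match_ipaddr4(sa1, sa2);
	case AF_INET6:
		return match_ipaddr6(sa1, sa2);
	}
	return 0;
}

int main(void)
{
	struct sockaddr_in a = { .sin_family = AF_INET };
	struct sockaddr_in b = { .sin_family = AF_INET };

	a.sin_addr.s_addr = b.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	return !match_ipaddr((struct sockaddr *)&a, (struct sockaddr *)&b);
}
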
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db3..370b190a09d1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	} else if (atomic_read(&new_dentry->d_count) > 1)
 		/* dentry still busy? */
 		goto out;
-	} else
-		nfs_drop_nlink(new_inode);
+	}
 
 go_ahead:
 	/*
@@ -1638,10 +1637,8 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode != NULL) {
+	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
-		d_delete(new_dentry);
-	}
 
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 			new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
 	if (rehash)
 		d_rehash(rehash);
 	if (!error) {
+		if (new_inode != NULL)
+			nfs_drop_nlink(new_inode);
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cec79392e4ba..0abf3f331f56 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = {
 	.write = do_sync_write,
 	.aio_read = nfs_file_read,
 	.aio_write = nfs_file_write,
-#ifdef CONFIG_MMU
 	.mmap = nfs_file_mmap,
-#else
-	.mmap = generic_file_mmap,
-#endif
 	.open = nfs_file_open,
 	.flush = nfs_file_flush,
 	.release = nfs_file_release,
@@ -141,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
-	/* Ensure that dirty pages are flushed out with the right creds */
-	if (filp->f_mode & FMODE_WRITE)
-		nfs_wb_all(dentry->d_inode);
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
 }
@@ -235,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	int status;
 
 	dprintk("NFS: flush(%s/%s)\n",
 			dentry->d_parent->d_name.name,
@@ -245,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 		return 0;
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
-	/* Ensure that data+attribute caches are up to date after close() */
-	status = nfs_do_fsync(ctx, inode);
-	if (!status)
-		nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	return status;
+	/* Flush writes to the server and return any errors */
+	return nfs_do_fsync(ctx, inode);
 }
 
 static ssize_t
@@ -304,11 +293,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dprintk("NFS: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_mapping(inode, file->f_mapping);
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *	 so we call that before revalidating the mapping
+	 */
+	status = generic_file_mmap(file, vma);
 	if (!status) {
 		vma->vm_ops = &nfs_file_vm_ops;
-		vma->vm_flags |= VM_CAN_NONLINEAR;
-		file_accessed(file);
+		status = nfs_revalidate_mapping(inode, file->f_mapping);
 	}
 	return status;
 }
@@ -354,6 +345,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+	/*
+	 * Prevent starvation issues if someone is doing a consistency
+	 * sync-to-disk
+	 */
+	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		return ret;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " getroot encountered non-directory\n");
 		return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
 		return ret;
 	}
 
-	if (fattr.type != NFDIR) {
+	if (!S_ISDIR(fattr.mode)) {
 		printk(KERN_ERR "nfs4_get_root:"
 		       " lookupfh encountered non-directory\n");
 		return -ENOTDIR;
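
With fattr->type gone, getroot.c now tests the decoded mode bits directly. For reference, S_ISDIR() is just a mask-and-compare on the S_IFMT bits of the mode word, as this tiny standalone check illustrates:

#include <assert.h>
#include <sys/stat.h>

int main(void)
{
	mode_t mode = S_IFDIR | 0755;

	assert(S_ISDIR(mode));			/* (mode & S_IFMT) == S_IFDIR */
	assert(!S_ISDIR(S_IFREG | 0644));	/* regular file is not a dir */
	return 0;
}
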
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..a834d1d850b7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -66,6 +66,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 }
 
 /**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+/**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
  *
@@ -249,13 +261,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 	struct inode *inode = ERR_PTR(-ENOENT);
 	unsigned long hash;
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
-
-	if (!fattr->nlink) {
-		printk("NFS: Buggy server - nlink == 0!\n");
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
-	}
 
 	hash = nfs_fattr_to_ino_t(fattr);
 
@@ -291,7 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				&& fattr->size <= NFS_LIMIT_READDIRPLUS)
 			set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 		/* Deal with crossing mountpoints */
-		if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+		if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+				&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 			if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 				inode->i_op = &nfs_referral_inode_operations;
 			else
@@ -304,28 +314,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		else
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
+		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+		nfsi->change_attr = 0;
+		inode->i_size = 0;
+		inode->i_nlink = 0;
+		inode->i_uid = -2;
+		inode->i_gid = -2;
+		inode->i_blocks = 0;
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
-		inode->i_atime = fattr->atime;
-		inode->i_mtime = fattr->mtime;
-		inode->i_ctime = fattr->ctime;
-		if (fattr->valid & NFS_ATTR_FATTR_V4)
+		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+			inode->i_atime = fattr->atime;
+		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			inode->i_mtime = fattr->mtime;
+		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			inode->i_ctime = fattr->ctime;
+		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			nfsi->change_attr = fattr->change_attr;
-		inode->i_size = nfs_size_to_loff_t(fattr->size);
-		inode->i_nlink = fattr->nlink;
-		inode->i_uid = fattr->uid;
-		inode->i_gid = fattr->gid;
-		if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+			inode->i_nlink = fattr->nlink;
+		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+			inode->i_uid = fattr->uid;
+		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+			inode->i_gid = fattr->gid;
+		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+			inode->i_blocks = fattr->du.nfs2.blocks;
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 			/*
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		} else {
-			inode->i_blocks = fattr->du.nfs2.blocks;
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 		nfsi->access_cache = RB_ROOT;
 
 		unlock_new_inode(inode);
@@ -514,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	return err;
 }
 
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode;
+	struct nfs_server *server;
+
+	if (!(ctx->mode & FMODE_WRITE))
+		return;
+	if (!is_sync)
+		return;
+	inode = ctx->path.dentry->d_inode;
+	if (!list_empty(&NFS_I(inode)->open_files))
+		return;
+	server = NFS_SERVER(inode);
+	if (server->flags & NFS_MOUNT_NOCTO)
+		return;
+	nfs_revalidate_inode(server, inode);
+}
+
 static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
@@ -540,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 	return ctx;
 }
 
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
-	struct inode *inode;
-
-	if (ctx == NULL)
-		return;
+	struct inode *inode = ctx->path.dentry->d_inode;
 
-	inode = ctx->path.dentry->d_inode;
 	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
 		return;
 	list_del(&ctx->list);
 	spin_unlock(&inode->i_lock);
-	if (ctx->state != NULL) {
-		if (wait)
-			nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
-		else
-			nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
-	}
+	NFS_PROTO(inode)->close_context(ctx, is_sync);
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	path_put(&ctx->path);
@@ -670,9 +714,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
 		goto out;
 
-	if (NFS_STALE(inode))
-		goto out;
-
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
@@ -815,25 +856,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
-			nfsi->change_attr == fattr->pre_change_attr) {
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+			&& nfsi->change_attr == fattr->pre_change_attr) {
 		nfsi->change_attr = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	}
 	/* If we have atomic WCC data, we may update some attributes */
-	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
 		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
+			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
 		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-	}
-	if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
-			nfsi->npages == 0)
-		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 	}
+	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
+			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+			&& nfsi->npages == 0)
+		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 }
 
 /**
@@ -853,35 +900,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 
 	/* Has the inode gone and changed behind our back? */
-	if (nfsi->fileid != fattr->fileid
-			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+		return -EIO;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		return -EIO;
-	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
-	cur_size = i_size_read(inode);
-	new_isize = nfs_size_to_loff_t(fattr->size);
-	if (cur_size != new_isize && nfsi->npages == 0)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		cur_size = i_size_read(inode);
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		if (cur_size != new_isize && nfsi->npages == 0)
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	}
 
 	/* Have any file permissions changed? */
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
-			|| inode->i_uid != fattr->uid
-			|| inode->i_gid != fattr->gid)
+	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
-	if (inode->i_nlink != fattr->nlink)
+	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
 		invalid |= NFS_INO_INVALID_ATTR;
 
-	if (!timespec_equal(&inode->i_atime, &fattr->atime))
+	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
 		invalid |= NFS_INO_INVALID_ATIME;
 
 	if (invalid != 0)
@@ -893,11 +944,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+		return 0;
 	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
 }
 
 static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		return 0;
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
@@ -1033,20 +1088,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 	/* Don't do a WCC update if these attributes are already stale */
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
 			!nfs_inode_attrs_need_update(inode, fattr)) {
-		fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+				| NFS_ATTR_FATTR_PRESIZE
+				| NFS_ATTR_FATTR_PREMTIME
+				| NFS_ATTR_FATTR_PRECTIME);
 		goto out_noforce;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
 		fattr->pre_change_attr = NFS_I(inode)->change_attr;
-		fattr->valid |= NFS_ATTR_WCC_V4;
+		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
-			(fattr->valid & NFS_ATTR_WCC) == 0) {
+	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
 		memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+		fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
 		memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+		fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+			(fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
 		fattr->pre_size = i_size_read(inode);
-		fattr->valid |= NFS_ATTR_WCC;
+		fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
 	}
 out_noforce:
 	status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1144,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if (nfsi->fileid != fattr->fileid)
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
 	server = NFS_SERVER(inode);
 	/* Update the fsid? */
-	if (S_ISDIR(inode->i_mode) &&
+	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
 			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
 			!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
 		server->fsid = fattr->fsid;
@@ -1099,14 +1165,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
 
-	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
-			| NFS_INO_REVAL_PAGECACHE);
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
 	/* More cache consistency checks */
-	if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (nfsi->change_attr != fattr->change_attr) {
+			dprintk("NFS: change_attr change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			if (S_ISDIR(inode->i_mode))
+				nfs_force_lookup_revalidate(inode);
+			nfsi->change_attr = fattr->change_attr;
+		}
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		/* NFSv2/v3: Check if the mtime agrees */
 		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 			dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1193,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
+			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-	} else if (nfsi->change_attr != fattr->change_attr) {
-		dprintk("NFS: change_attr change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		if (S_ISDIR(inode->i_mode))
-			nfs_force_lookup_revalidate(inode);
+			/* and probably clear data for a directory too as utimes can cause
+			 * havoc with our cache.
+			 */
+			if (S_ISDIR(inode->i_mode)) {
+				invalid |= NFS_INO_INVALID_DATA;
+				nfs_force_lookup_revalidate(inode);
+			}
+			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		}
 	}
 
 	/* Check if our cached file size is stale */
-	new_isize = nfs_size_to_loff_t(fattr->size);
-	cur_isize = i_size_read(inode);
-	if (new_isize != cur_isize) {
-		/* Do we perhaps have any outstanding writes, or has
-		 * the file grown beyond our last write? */
-		if (nfsi->npages == 0 || new_isize > cur_isize) {
-			i_size_write(inode, new_isize);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		cur_isize = i_size_read(inode);
+		if (new_isize != cur_isize) {
+			/* Do we perhaps have any outstanding writes, or has
+			 * the file grown beyond our last write? */
+			if (nfsi->npages == 0 || new_isize > cur_isize) {
+				i_size_write(inode, new_isize);
+				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+			}
+			dprintk("NFS: isize change on server for file %s/%ld\n",
+					inode->i_sb->s_id, inode->i_ino);
 		}
-		dprintk("NFS: isize change on server for file %s/%ld\n",
-				inode->i_sb->s_id, inode->i_ino);
 	}
 
 
-	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-	nfsi->change_attr = fattr->change_attr;
-
-	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
-			inode->i_uid != fattr->uid ||
-			inode->i_gid != fattr->gid)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
 
-	if (inode->i_nlink != fattr->nlink)
-		invalid |= NFS_INO_INVALID_ATTR;
+	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_mode = fattr->mode;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+		if (inode->i_uid != fattr->uid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_uid = fattr->uid;
+		}
+	}
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+		if (inode->i_gid != fattr->gid) {
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			inode->i_gid = fattr->gid;
+		}
+	}
 
-	inode->i_mode = fattr->mode;
-	inode->i_nlink = fattr->nlink;
-	inode->i_uid = fattr->uid;
-	inode->i_gid = fattr->gid;
+	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+		if (inode->i_nlink != fattr->nlink) {
+			invalid |= NFS_INO_INVALID_ATTR;
+			if (S_ISDIR(inode->i_mode))
+				invalid |= NFS_INO_INVALID_DATA;
+			inode->i_nlink = fattr->nlink;
+		}
+	}
 
-	if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 		/*
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-	} else {
-		inode->i_blocks = fattr->du.nfs2.blocks;
 	}
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
 	if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1374,6 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
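
The inode.c rework replaces the coarse NFS_ATTR_FATTR/_V3/_V4 validity flags with one NFS_ATTR_FATTR_* bit per attribute, so every consumer must check the bit before trusting the field. A hedged, self-contained sketch of that guard-every-field pattern (the flag values and struct layout are the example's own, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define ATTR_SIZE  (1u << 0)
#define ATTR_MTIME (1u << 1)
#define ATTR_NLINK (1u << 2)

struct fattr {
	uint32_t valid;		/* bitmask of fields the server returned */
	uint64_t size;
	int64_t  mtime;
	uint32_t nlink;
};

struct cached_inode {
	uint64_t size;
	int64_t  mtime;
	uint32_t nlink;
};

/* Update only the fields the reply actually carried; anything else keeps
 * its cached value instead of being clobbered with stale or zeroed data. */
static void update_inode(struct cached_inode *ino, const struct fattr *f)
{
	if (f->valid & ATTR_SIZE)
		ino->size = f->size;
	if (f->valid & ATTR_MTIME)
		ino->mtime = f->mtime;
	if (f->valid & ATTR_NLINK)
		ino->nlink = f->nlink;
}

int main(void)
{
	struct cached_inode ino = { 100, 5, 1 };
	struct fattr f = { .valid = ATTR_SIZE, .size = 200 };

	update_inode(&ino, &f);
	printf("size=%llu nlink=%u\n",
	       (unsigned long long)ino.size, ino.nlink); /* size=200 nlink=1 */
	return 0;
}
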
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..2041f68ff1cc 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
 /* dir.c */
 extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 
@@ -165,6 +168,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs4_clear_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
 
 /* super.c */
 void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
 static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
-	u32 rdev;
-	fattr->type = (enum nfs_ftype) ntohl(*p++);
+	u32 rdev, type;
+	type = ntohl(*p++);
 	fattr->mode = ntohl(*p++);
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
 	p = xdr_decode_time(p, &fattr->ctime);
-	fattr->valid |= NFS_ATTR_FATTR;
+	fattr->valid |= NFS_ATTR_FATTR_V2;
 	fattr->rdev = new_decode_dev(rdev);
-	if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
-		fattr->type = NFFIFO;
+	if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
 		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
 		fattr->rdev = 0;
 	}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..b82fe6847f14 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.commit_done = nfs3_commit_done,
 	.lock = nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
+	.close_context = nfs_close_context,
 };
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde46..e6a1932c7110 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
 /*
  * Map file type to S_IFMT bits
  */
-static struct {
-	unsigned int mode;
-	unsigned int nfs2type;
-} nfs_type2fmt[] = {
-	{ 0, NFNON },
-	{ S_IFREG, NFREG },
-	{ S_IFDIR, NFDIR },
-	{ S_IFBLK, NFBLK },
-	{ S_IFCHR, NFCHR },
-	{ S_IFLNK, NFLNK },
-	{ S_IFSOCK, NFSOCK },
-	{ S_IFIFO, NFFIFO },
-	{ 0, NFBAD }
+static const umode_t nfs_type2fmt[] = {
+	[NF3BAD] = 0,
+	[NF3REG] = S_IFREG,
+	[NF3DIR] = S_IFDIR,
+	[NF3BLK] = S_IFBLK,
+	[NF3CHR] = S_IFCHR,
+	[NF3LNK] = S_IFLNK,
+	[NF3SOCK] = S_IFSOCK,
+	[NF3FIFO] = S_IFIFO,
 };
 
 /*
@@ -148,13 +144,12 @@ static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
 	unsigned int type, major, minor;
-	int fmode;
+	umode_t fmode;
 
 	type = ntohl(*p++);
-	if (type >= NF3BAD)
-		type = NF3BAD;
-	fmode = nfs_type2fmt[type].mode;
-	fattr->type = nfs_type2fmt[type].nfs2type;
+	if (type > NF3FIFO)
+		type = NF3NON;
+	fmode = nfs_type2fmt[type];
 	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
 	fattr->nlink = ntohl(*p++);
 	fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_time3(p, &fattr->ctime);
 
 	/* Update the mode bits */
-	fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return p;
 }
 
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
 	p = xdr_decode_hyper(p, &fattr->pre_size);
 	p = xdr_decode_time3(p, &fattr->pre_mtime);
 	p = xdr_decode_time3(p, &fattr->pre_ctime);
-	fattr->valid |= NFS_ATTR_WCC;
+	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PREMTIME
+		| NFS_ATTR_FATTR_PRECTIME;
 	return p;
 }
 
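
Both the v3 and v4 XDR decoders replace the old struct-pair table with a designated-initializer array indexed by the wire file type, after clamping out-of-range values. A small userspace sketch of the same lookup idiom (enum names and values invented for the example):

#include <stdio.h>
#include <sys/stat.h>

enum ftype { F_BAD = 0, F_REG, F_DIR, F_FIFO, F_MAX = F_FIFO };

/* Designated initializers keep index and value visibly paired. */
static const mode_t type2fmt[] = {
	[F_BAD]  = 0,
	[F_REG]  = S_IFREG,
	[F_DIR]  = S_IFDIR,
	[F_FIFO] = S_IFIFO,
};

static mode_t decode_type(unsigned int wire_type)
{
	if (wire_type > F_MAX)	/* clamp untrusted input before indexing */
		wire_type = F_BAD;
	return type2fmt[wire_type];
}

int main(void)
{
	printf("dir fmt bits: %o\n", (unsigned)decode_type(F_DIR));
	printf("bogus type:   %o\n", (unsigned)decode_type(42));
	return 0;
}
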
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..97bacccff579 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start, KM_USER0);
 }
 
-static int nfs4_wait_bit_killable(void *word)
-{
-	if (fatal_signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
 static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 	might_sleep();
 
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-			nfs4_wait_bit_killable, TASK_KILLABLE);
+			nfs_wait_bit_killable, TASK_KILLABLE);
 	return res;
 }
 
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
 	calldata->arg.fmode = 0;
-	calldata->arg.bitmask = server->attr_bitmask;
+	calldata->arg.bitmask = server->cache_consistency_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
@@ -1580,6 +1572,15 @@ out_drop:
 	return 0;
 }
 
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	if (ctx->state == NULL)
+		return;
+	if (is_sync)
+		nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+	else
+		nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
+		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
 	}
 	return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_removeargs *args = msg->rpc_argp;
 	struct nfs_removeres *res = msg->rpc_resp;
 
-	args->bitmask = server->attr_bitmask;
+	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.pages = &page,
 		.pgbase = 0,
 		.count = count,
-		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	data->timestamp = jiffies;
 
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+	if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+		(fattr->valid & NFS_ATTR_FATTR_FSID) &&
+		(fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
 int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 	fs_locations->server = server;
 	fs_locations->nlocations = 0;
 	status = rpc_call_sync(server->client, &msg, 0);
+	nfs_fixup_referral_attributes(&fs_locations->fattr);
 	dprintk("%s: returned status = %d\n", __func__, status);
 	return status;
 }
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.commit_done = nfs4_commit_done,
 	.lock = nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
+	.close_context = nfs4_close_context,
 };
 
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
 
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
-			nfs_callback_tcpport, cred);
+	unsigned short port;
+	int status;
+
+	port = nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
 	if (status == 0)
 		status = nfs4_proc_setclientid_confirm(clp, cred);
 	if (status == 0)
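
With separate IPv4 and IPv6 callback listeners, SETCLIENTID must advertise whichever port matches the transport family of the client's own address; the selection is a single conditional on ss_family. A trivial userspace sketch (the port values are invented for the example):

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static unsigned short cb_port4 = 40001;	/* illustrative, not NFS's ports */
static unsigned short cb_port6 = 40002;

/* Pick the callback port that matches the client's address family. */
static unsigned short callback_port(const struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET6)
		return cb_port6;
	return cb_port4;
}

int main(void)
{
	struct sockaddr_storage ss = { .ss_family = AF_INET6 };

	printf("port = %u\n", callback_port(&ss));
	return 0;
}
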
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
 		decode_lookup_maxsz + \
 		decode_fs_locations_maxsz)
 
-static struct {
-	unsigned int mode;
-	unsigned int nfs2type;
-} nfs_type2fmt[] = {
-	{ 0, NFNON },
-	{ S_IFREG, NFREG },
-	{ S_IFDIR, NFDIR },
-	{ S_IFBLK, NFBLK },
-	{ S_IFCHR, NFCHR },
-	{ S_IFLNK, NFLNK },
-	{ S_IFSOCK, NFSOCK },
-	{ S_IFIFO, NFFIFO },
-	{ 0, NFNON },
-	{ 0, NFNON },
+static const umode_t nfs_type2fmt[] = {
+	[NF4BAD] = 0,
+	[NF4REG] = S_IFREG,
+	[NF4DIR] = S_IFDIR,
+	[NF4BLK] = S_IFBLK,
+	[NF4CHR] = S_IFCHR,
+	[NF4LNK] = S_IFLNK,
+	[NF4SOCK] = S_IFSOCK,
+	[NF4FIFO] = S_IFIFO,
+	[NF4ATTRDIR] = 0,
+	[NF4NAMEDATTR] = 0,
 };
 
 struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
 {
 	__be32 *p;
+	int ret = 0;
 
 	*type = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2172 return -EIO; 2170 return -EIO;
2173 } 2171 }
2174 bitmap[0] &= ~FATTR4_WORD0_TYPE; 2172 bitmap[0] &= ~FATTR4_WORD0_TYPE;
2173 ret = NFS_ATTR_FATTR_TYPE;
2175 } 2174 }
2176 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); 2175 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2177 return 0; 2176 return ret;
2178} 2177}
2179 2178
2180static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2179static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2181{ 2180{
2182 __be32 *p; 2181 __be32 *p;
2182 int ret = 0;
2183 2183
2184 *change = 0; 2184 *change = 0;
2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
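This hunk establishes the convention used by every decode_attr_*() helper that follows: return a negative errno on a decode error, 0 when the attribute was absent from the reply, and the matching NFS_ATTR_FATTR_* flag when it was decoded, so the caller can OR the result straight into fattr->valid. A minimal sketch of the shape, with decode_attr_example() as a hypothetical name (READ_BUF/READ64 are the file's existing XDR macros):

    static int decode_attr_example(struct xdr_stream *xdr, uint32_t *bitmap,
                                   uint64_t *res)
    {
        __be32 *p;
        int ret = 0;

        *res = 0;
        if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
            return -EIO;                    /* attributes out of order */
        if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
            READ_BUF(8);
            READ64(*res);
            bitmap[0] &= ~FATTR4_WORD0_SIZE;
            ret = NFS_ATTR_FATTR_SIZE;      /* tell the caller what arrived */
        }
        return ret;                         /* 0 means "not in this reply" */
    }
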
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2188 READ_BUF(8); 2188 READ_BUF(8);
2189 READ64(*change); 2189 READ64(*change);
2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2191 ret = NFS_ATTR_FATTR_CHANGE;
2191 } 2192 }
2192 dprintk("%s: change attribute=%Lu\n", __func__, 2193 dprintk("%s: change attribute=%Lu\n", __func__,
2193 (unsigned long long)*change); 2194 (unsigned long long)*change);
2194 return 0; 2195 return ret;
2195} 2196}
2196 2197
2197static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2198static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2198{ 2199{
2199 __be32 *p; 2200 __be32 *p;
2201 int ret = 0;
2200 2202
2201 *size = 0; 2203 *size = 0;
2202 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2204 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2205 READ_BUF(8); 2207 READ_BUF(8);
2206 READ64(*size); 2208 READ64(*size);
2207 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2209 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2210 ret = NFS_ATTR_FATTR_SIZE;
2208 } 2211 }
2209 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2212 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2210 return 0; 2213 return ret;
2211} 2214}
2212 2215
2213static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2216static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2245static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2248static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2246{ 2249{
2247 __be32 *p; 2250 __be32 *p;
2251 int ret = 0;
2248 2252
2249 fsid->major = 0; 2253 fsid->major = 0;
2250 fsid->minor = 0; 2254 fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2255 READ64(fsid->major); 2259 READ64(fsid->major);
2256 READ64(fsid->minor); 2260 READ64(fsid->minor);
2257 bitmap[0] &= ~FATTR4_WORD0_FSID; 2261 bitmap[0] &= ~FATTR4_WORD0_FSID;
2262 ret = NFS_ATTR_FATTR_FSID;
2258 } 2263 }
2259 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, 2264 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
2260 (unsigned long long)fsid->major, 2265 (unsigned long long)fsid->major,
2261 (unsigned long long)fsid->minor); 2266 (unsigned long long)fsid->minor);
2262 return 0; 2267 return ret;
2263} 2268}
2264 2269
2265static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2270static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2297static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2302static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2298{ 2303{
2299 __be32 *p; 2304 __be32 *p;
2305 int ret = 0;
2300 2306
2301 *fileid = 0; 2307 *fileid = 0;
2302 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2308 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2305 READ_BUF(8); 2311 READ_BUF(8);
2306 READ64(*fileid); 2312 READ64(*fileid);
2307 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2313 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2314 ret = NFS_ATTR_FATTR_FILEID;
2308 } 2315 }
2309 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2316 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2310 return 0; 2317 return ret;
2311} 2318}
2312 2319
2313static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2320static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2314{ 2321{
2315 __be32 *p; 2322 __be32 *p;
2323 int ret = 0;
2316 2324
2317 *fileid = 0; 2325 *fileid = 0;
2318 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2326 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2321 READ_BUF(8); 2329 READ_BUF(8);
2322 READ64(*fileid); 2330 READ64(*fileid);
2323 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2331 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2332 ret = NFS_ATTR_FATTR_FILEID;
2324 } 2333 }
2325 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2334 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2326 return 0; 2335 return ret;
2327} 2336}
2328 2337
2329static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2338static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2479 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) 2488 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2480 res->nlocations++; 2489 res->nlocations++;
2481 } 2490 }
2491 if (res->nlocations != 0)
2492 status = NFS_ATTR_FATTR_V4_REFERRAL;
2482out: 2493out:
2483 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2494 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2484 return status; 2495 return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
2580 return status; 2591 return status;
2581} 2592}
2582 2593
2583static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) 2594static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
2584{ 2595{
2596 uint32_t tmp;
2585 __be32 *p; 2597 __be32 *p;
2598 int ret = 0;
2586 2599
2587 *mode = 0; 2600 *mode = 0;
2588 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 2601 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
2589 return -EIO; 2602 return -EIO;
2590 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 2603 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
2591 READ_BUF(4); 2604 READ_BUF(4);
2592 READ32(*mode); 2605 READ32(tmp);
2593 *mode &= ~S_IFMT; 2606 *mode = tmp & ~S_IFMT;
2594 bitmap[1] &= ~FATTR4_WORD1_MODE; 2607 bitmap[1] &= ~FATTR4_WORD1_MODE;
2608 ret = NFS_ATTR_FATTR_MODE;
2595 } 2609 }
2596 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 2610 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
2597 return 0; 2611 return ret;
2598} 2612}
2599 2613
2600static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 2614static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
2601{ 2615{
2602 __be32 *p; 2616 __be32 *p;
2617 int ret = 0;
2603 2618
2604 *nlink = 1; 2619 *nlink = 1;
2605 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 2620 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
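decode_attr_mode() above now decodes into a local uint32_t and only then narrows to the caller's umode_t, so *mode carries permission bits only and the S_IF* type bits stay whatever decode_attr_type() contributed. The essential step, as a sketch:

    uint32_t tmp;
    READ32(tmp);            /* raw 32-bit mode4 word off the XDR stream */
    *mode = tmp & ~S_IFMT;  /* permissions only; the file type arrives
                             * via the separate TYPE attribute */
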
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2608 READ_BUF(4); 2623 READ_BUF(4);
2609 READ32(*nlink); 2624 READ32(*nlink);
2610 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 2625 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
2626 ret = NFS_ATTR_FATTR_NLINK;
2611 } 2627 }
2612 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 2628 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
2613 return 0; 2629 return ret;
2614} 2630}
2615 2631
2616static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 2632static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
2617{ 2633{
2618 uint32_t len; 2634 uint32_t len;
2619 __be32 *p; 2635 __be32 *p;
2636 int ret = 0;
2620 2637
2621 *uid = -2; 2638 *uid = -2;
2622 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 2639 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2626 READ32(len); 2643 READ32(len);
2627 READ_BUF(len); 2644 READ_BUF(len);
2628 if (len < XDR_MAX_NETOBJ) { 2645 if (len < XDR_MAX_NETOBJ) {
2629 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) 2646 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
2647 ret = NFS_ATTR_FATTR_OWNER;
2648 else
2630 dprintk("%s: nfs_map_name_to_uid failed!\n", 2649 dprintk("%s: nfs_map_name_to_uid failed!\n",
2631 __func__); 2650 __func__);
2632 } else 2651 } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2635 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2654 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2636 } 2655 }
2637 dprintk("%s: uid=%d\n", __func__, (int)*uid); 2656 dprintk("%s: uid=%d\n", __func__, (int)*uid);
2638 return 0; 2657 return ret;
2639} 2658}
2640 2659
2641static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 2660static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
2642{ 2661{
2643 uint32_t len; 2662 uint32_t len;
2644 __be32 *p; 2663 __be32 *p;
2664 int ret = 0;
2645 2665
2646 *gid = -2; 2666 *gid = -2;
2647 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 2667 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2651 READ32(len); 2671 READ32(len);
2652 READ_BUF(len); 2672 READ_BUF(len);
2653 if (len < XDR_MAX_NETOBJ) { 2673 if (len < XDR_MAX_NETOBJ) {
2654 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) 2674 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
2675 ret = NFS_ATTR_FATTR_GROUP;
2676 else
2655 dprintk("%s: nfs_map_group_to_gid failed!\n", 2677 dprintk("%s: nfs_map_group_to_gid failed!\n",
2656 __func__); 2678 __func__);
2657 } else 2679 } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2682 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2661 } 2683 }
2662 dprintk("%s: gid=%d\n", __func__, (int)*gid); 2684 dprintk("%s: gid=%d\n", __func__, (int)*gid);
2663 return 0; 2685 return ret;
2664} 2686}
2665 2687
2666static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 2688static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
2667{ 2689{
2668 uint32_t major = 0, minor = 0; 2690 uint32_t major = 0, minor = 0;
2669 __be32 *p; 2691 __be32 *p;
2692 int ret = 0;
2670 2693
2671 *rdev = MKDEV(0,0); 2694 *rdev = MKDEV(0,0);
2672 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) 2695 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
2681 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 2704 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
2682 *rdev = tmp; 2705 *rdev = tmp;
2683 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; 2706 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
2707 ret = NFS_ATTR_FATTR_RDEV;
2684 } 2708 }
2685 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 2709 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
2686 return 0; 2710 return ret;
2687} 2711}
2688 2712
2689static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2713static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2740static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 2764static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
2741{ 2765{
2742 __be32 *p; 2766 __be32 *p;
2767 int ret = 0;
2743 2768
2744 *used = 0; 2769 *used = 0;
2745 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 2770 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
2748 READ_BUF(8); 2773 READ_BUF(8);
2749 READ64(*used); 2774 READ64(*used);
2750 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 2775 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
2776 ret = NFS_ATTR_FATTR_SPACE_USED;
2751 } 2777 }
2752 dprintk("%s: space used=%Lu\n", __func__, 2778 dprintk("%s: space used=%Lu\n", __func__,
2753 (unsigned long long)*used); 2779 (unsigned long long)*used);
2754 return 0; 2780 return ret;
2755} 2781}
2756 2782
2757static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 2783static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
2778 return -EIO; 2804 return -EIO;
2779 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { 2805 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
2780 status = decode_attr_time(xdr, time); 2806 status = decode_attr_time(xdr, time);
2807 if (status == 0)
2808 status = NFS_ATTR_FATTR_ATIME;
2781 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; 2809 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
2782 } 2810 }
2783 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); 2811 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
2794 return -EIO; 2822 return -EIO;
2795 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { 2823 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
2796 status = decode_attr_time(xdr, time); 2824 status = decode_attr_time(xdr, time);
2825 if (status == 0)
2826 status = NFS_ATTR_FATTR_CTIME;
2797 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; 2827 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
2798 } 2828 }
2799 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); 2829 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
2810 return -EIO; 2840 return -EIO;
2811 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { 2841 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
2812 status = decode_attr_time(xdr, time); 2842 status = decode_attr_time(xdr, time);
2843 if (status == 0)
2844 status = NFS_ATTR_FATTR_MTIME;
2813 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; 2845 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
2814 } 2846 }
2815 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); 2847 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2994 uint32_t attrlen, 3026 uint32_t attrlen,
2995 bitmap[2] = {0}, 3027 bitmap[2] = {0},
2996 type; 3028 type;
2997 int status, fmode = 0; 3029 int status;
3030 umode_t fmode = 0;
2998 uint64_t fileid; 3031 uint64_t fileid;
2999 3032
3000 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3033 status = decode_op_hdr(xdr, OP_GETATTR);
3001 goto xdr_error; 3034 if (status < 0)
3002 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
3003 goto xdr_error; 3035 goto xdr_error;
3004 3036
3005 fattr->bitmap[0] = bitmap[0]; 3037 status = decode_attr_bitmap(xdr, bitmap);
3006 fattr->bitmap[1] = bitmap[1]; 3038 if (status < 0)
3039 goto xdr_error;
3007 3040
3008 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 3041 status = decode_attr_length(xdr, &attrlen, &savep);
3042 if (status < 0)
3009 goto xdr_error; 3043 goto xdr_error;
3010 3044
3011 3045
3012 if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) 3046 status = decode_attr_type(xdr, bitmap, &type);
3047 if (status < 0)
3013 goto xdr_error; 3048 goto xdr_error;
3014 fattr->type = nfs_type2fmt[type].nfs2type; 3049 fattr->mode = 0;
3015 fmode = nfs_type2fmt[type].mode; 3050 if (status != 0) {
3051 fattr->mode |= nfs_type2fmt[type];
3052 fattr->valid |= status;
3053 }
3016 3054
3017 if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) 3055 status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
3056 if (status < 0)
3018 goto xdr_error; 3057 goto xdr_error;
3019 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3058 fattr->valid |= status;
3059
3060 status = decode_attr_size(xdr, bitmap, &fattr->size);
3061 if (status < 0)
3020 goto xdr_error; 3062 goto xdr_error;
3021 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) 3063 fattr->valid |= status;
3064
3065 status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
3066 if (status < 0)
3022 goto xdr_error; 3067 goto xdr_error;
3023 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3068 fattr->valid |= status;
3069
3070 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3071 if (status < 0)
3024 goto xdr_error; 3072 goto xdr_error;
3025 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 3073 fattr->valid |= status;
3074
3075 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3026 struct nfs4_fs_locations, 3076 struct nfs4_fs_locations,
3027 fattr))) != 0) 3077 fattr));
3078 if (status < 0)
3028 goto xdr_error; 3079 goto xdr_error;
3029 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3080 fattr->valid |= status;
3081
3082 status = decode_attr_mode(xdr, bitmap, &fmode);
3083 if (status < 0)
3030 goto xdr_error; 3084 goto xdr_error;
3031 fattr->mode |= fmode; 3085 if (status != 0) {
3032 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3086 fattr->mode |= fmode;
3087 fattr->valid |= status;
3088 }
3089
3090 status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
3091 if (status < 0)
3033 goto xdr_error; 3092 goto xdr_error;
3034 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) 3093 fattr->valid |= status;
3094
3095 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
3096 if (status < 0)
3035 goto xdr_error; 3097 goto xdr_error;
3036 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) 3098 fattr->valid |= status;
3099
3100 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
3101 if (status < 0)
3037 goto xdr_error; 3102 goto xdr_error;
3038 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3103 fattr->valid |= status;
3104
3105 status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
3106 if (status < 0)
3039 goto xdr_error; 3107 goto xdr_error;
3040 if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) 3108 fattr->valid |= status;
3109
3110 status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
3111 if (status < 0)
3041 goto xdr_error; 3112 goto xdr_error;
3042 if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) 3113 fattr->valid |= status;
3114
3115 status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
3116 if (status < 0)
3043 goto xdr_error; 3117 goto xdr_error;
3044 if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) 3118 fattr->valid |= status;
3119
3120 status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
3121 if (status < 0)
3045 goto xdr_error; 3122 goto xdr_error;
3046 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3123 fattr->valid |= status;
3124
3125 status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
3126 if (status < 0)
3047 goto xdr_error; 3127 goto xdr_error;
3048 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) 3128 fattr->valid |= status;
3129
3130 status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
3131 if (status < 0)
3049 goto xdr_error; 3132 goto xdr_error;
3050 if (fattr->fileid == 0 && fileid != 0) 3133 if (status != 0 && !(fattr->valid & status)) {
3051 fattr->fileid = fileid; 3134 fattr->fileid = fileid;
3052 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3135 fattr->valid |= status;
3053 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3136 }
3137
3138 status = verify_attr_len(xdr, savep, attrlen);
3054xdr_error: 3139xdr_error:
3055 dprintk("%s: xdr returned %d\n", __func__, -status); 3140 dprintk("%s: xdr returned %d\n", __func__, -status);
3056 return status; 3141 return status;
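The decode_getfattr() rewrite above repeats one pattern per attribute: a negative return aborts the decode, while a positive return is that attribute's NFS_ATTR_FATTR_* bit and is accumulated into fattr->valid, so the inode layer later knows exactly which fields the server really sent. The recurring step, as a sketch:

    status = decode_attr_size(xdr, bitmap, &fattr->size);
    if (status < 0)
        goto xdr_error;     /* malformed reply */
    fattr->valid |= status; /* 0 if absent, NFS_ATTR_FATTR_SIZE if decoded */
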
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4078 status = decode_setattr(&xdr, res); 4163 status = decode_setattr(&xdr, res);
4079 if (status) 4164 if (status)
4080 goto out; 4165 goto out;
4081 status = decode_getfattr(&xdr, res->fattr, res->server); 4166 decode_getfattr(&xdr, res->fattr, res->server);
4082 if (status == NFS4ERR_DELAY)
4083 status = 0;
4084out: 4167out:
4085 return status; 4168 return status;
4086} 4169}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 176 kref_put(&req->wb_kref, nfs_free_request);
177} 177}
178 178
179static int nfs_wait_bit_killable(void *word)
180{
181 int ret = 0;
182
183 if (fatal_signal_pending(current))
184 ret = -ERESTARTSYS;
185 else
186 schedule();
187 return ret;
188}
189
190/** 179/**
191 * nfs_wait_on_request - Wait for a request to complete. 180 * nfs_wait_on_request - Wait for a request to complete.
192 * @req: request to wait upon. 181 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
663 .commit_setup = nfs_proc_commit_setup, 663 .commit_setup = nfs_proc_commit_setup,
664 .lock = nfs_proc_lock, 664 .lock = nfs_proc_lock,
665 .lock_check_bounds = nfs_lock_check_bounds, 665 .lock_check_bounds = nfs_lock_check_bounds,
666 .close_context = nfs_close_context,
666}; 667};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..0942fcbbad3c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw,
1018 case Opt_rdma: 1018 case Opt_rdma:
1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1021 xprt_load_transport(p);
1021 break; 1022 break;
1022 case Opt_acl: 1023 case Opt_acl:
1023 mnt->flags &= ~NFS_MOUNT_NOACL; 1024 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw,
1205 /* vector side protocols to TCP */ 1206 /* vector side protocols to TCP */
1206 mnt->flags |= NFS_MOUNT_TCP; 1207 mnt->flags |= NFS_MOUNT_TCP;
1207 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1208 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1209 xprt_load_transport(string);
1208 break; 1210 break;
1209 default: 1211 default:
1210 errors++; 1212 errors++;
1211 dfprintk(MOUNT, "NFS: unrecognized " 1213 dfprintk(MOUNT, "NFS: unrecognized "
1212 "transport protocol\n"); 1214 "transport protocol\n");
1213 } 1215 }
1216 kfree(string);
1214 break; 1217 break;
1215 case Opt_mountproto: 1218 case Opt_mountproto:
1216 string = match_strdup(args); 1219 string = match_strdup(args);
@@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw,
1218 goto out_nomem; 1221 goto out_nomem;
1219 token = match_token(string, 1222 token = match_token(string,
1220 nfs_xprt_protocol_tokens, args); 1223 nfs_xprt_protocol_tokens, args);
1221 kfree(string);
1222 1224
1223 switch (token) { 1225 switch (token) {
1224 case Opt_xprt_udp: 1226 case Opt_xprt_udp:
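Both proto=rdma branches above now call xprt_load_transport() so a transport class that has not registered yet can be demand-loaded at mount time, and kfree(string) is deferred until after the switch so the option string is still live when handed to xprt_load_transport(). A hedged sketch of the flow (the request_module() detail is an assumption about xprt_load_transport()'s implementation):

    token = match_token(string, nfs_xprt_protocol_tokens, args);
    switch (token) {
    case Opt_xprt_rdma:
        mnt->flags |= NFS_MOUNT_TCP;        /* side protocols ride on TCP */
        mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
        xprt_load_transport(string);        /* may modprobe the rdma transport */
        break;
    /* ... other transports ... */
    }
    kfree(string);                          /* freed on every path */
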
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
314{ 314{
315 struct inode *inode = mapping->host; 315 struct inode *inode = mapping->host;
316 unsigned long *bitlock = &NFS_I(inode)->flags;
316 struct nfs_pageio_descriptor pgio; 317 struct nfs_pageio_descriptor pgio;
317 int err; 318 int err;
318 319
320 /* Stop dirtying of new pages while we sync */
321 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
322 nfs_wait_bit_killable, TASK_KILLABLE);
323 if (err)
324 goto out_err;
325
319 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 326 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
320 327
321 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 328 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
322 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 329 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
323 nfs_pageio_complete(&pgio); 330 nfs_pageio_complete(&pgio);
331
332 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
333 smp_mb__after_clear_bit();
334 wake_up_bit(bitlock, NFS_INO_FLUSHING);
335
324 if (err < 0) 336 if (err < 0)
325 return err; 337 goto out_err;
326 if (pgio.pg_error < 0) 338 err = pgio.pg_error;
327 return pgio.pg_error; 339 if (err < 0)
340 goto out_err;
328 return 0; 341 return 0;
342out_err:
343 return err;
329} 344}
330 345
331/* 346/*
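nfs_writepages() above serializes flushers on a new per-inode NFS_INO_FLUSHING bit. The lock and unlock sides follow the standard wait_on_bit_lock() protocol; in particular, the memory barrier after clearing the bit is what makes the wake-up visible to a task about to re-test the bit. Condensed sketch, assuming bitlock points at NFS_I(inode)->flags:

    err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
                           nfs_wait_bit_killable, TASK_KILLABLE);
    if (err)
        return err;                 /* fatal signal while waiting */

    /* ... write back the dirty pages ... */

    clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
    smp_mb__after_clear_bit();      /* order the clear before the wake-up */
    wake_up_bit(bitlock, NFS_INO_FLUSHING);
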
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
404 struct nfs_inode *nfsi = NFS_I(inode); 419 struct nfs_inode *nfsi = NFS_I(inode);
405 420
406 spin_lock(&inode->i_lock); 421 spin_lock(&inode->i_lock);
407 nfsi->ncommit++;
408 set_bit(PG_CLEAN, &(req)->wb_flags); 422 set_bit(PG_CLEAN, &(req)->wb_flags);
409 radix_tree_tag_set(&nfsi->nfs_page_tree, 423 radix_tree_tag_set(&nfsi->nfs_page_tree,
410 req->wb_index, 424 req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
524} 538}
525 539
526#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 540#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
541static int
542nfs_need_commit(struct nfs_inode *nfsi)
543{
544 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
545}
546
527/* 547/*
528 * nfs_scan_commit - Scan an inode for commit requests 548 * nfs_scan_commit - Scan an inode for commit requests
529 * @inode: NFS inode to scan 549 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
538nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 558nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
539{ 559{
540 struct nfs_inode *nfsi = NFS_I(inode); 560 struct nfs_inode *nfsi = NFS_I(inode);
541 int res = 0;
542 561
543 if (nfsi->ncommit != 0) { 562 if (!nfs_need_commit(nfsi))
544 res = nfs_scan_list(nfsi, dst, idx_start, npages, 563 return 0;
545 NFS_PAGE_TAG_COMMIT); 564
546 nfsi->ncommit -= res; 565 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 }
548 return res;
549} 566}
550#else 567#else
568static inline int nfs_need_commit(struct nfs_inode *nfsi)
569{
570 return 0;
571}
572
551static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 573static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
552{ 574{
553 return 0; 575 return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
820 data->args.stable = NFS_UNSTABLE; 842 data->args.stable = NFS_UNSTABLE;
821 if (how & FLUSH_STABLE) { 843 if (how & FLUSH_STABLE) {
822 data->args.stable = NFS_DATA_SYNC; 844 data->args.stable = NFS_DATA_SYNC;
823 if (!NFS_I(inode)->ncommit) 845 if (!nfs_need_commit(NFS_I(inode)))
824 data->args.stable = NFS_FILE_SYNC; 846 data->args.stable = NFS_FILE_SYNC;
825 } 847 }
826 848
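With the ncommit counter gone, "anything to commit?" is answered by the radix tree's own COMMIT tag, which cannot drift out of sync with the tagged pages the way a hand-maintained counter could; radix_tree_tagged() only inspects the tree root, so the test stays cheap. Usage sketch from the stable-write heuristic above:

    if (how & FLUSH_STABLE) {
        data->args.stable = NFS_DATA_SYNC;
        if (!nfs_need_commit(NFS_I(inode)))     /* no pages tagged for COMMIT */
            data->args.stable = NFS_FILE_SYNC;  /* so ask for a full sync write */
    }
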
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1425{ 1447{
1426 struct writeback_control wbc = { 1448 struct writeback_control wbc = {
1427 .bdi = mapping->backing_dev_info, 1449 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1450 .sync_mode = WB_SYNC_ALL,
1429 .nr_to_write = LONG_MAX, 1451 .nr_to_write = LONG_MAX,
1430 .range_start = 0, 1452 .range_start = 0,
1431 .range_end = LLONG_MAX, 1453 .range_end = LLONG_MAX,
1432 .for_writepages = 1, 1454 .for_writepages = 1,
1433 }; 1455 };
1434 int ret;
1435 1456
1436 ret = __nfs_write_mapping(mapping, &wbc, how);
1437 if (ret < 0)
1438 return ret;
1439 wbc.sync_mode = WB_SYNC_ALL;
1440 return __nfs_write_mapping(mapping, &wbc, how); 1457 return __nfs_write_mapping(mapping, &wbc, how);
1441} 1458}
1442 1459
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..a4ed8644d69c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
938 char transport[16]; 938 char transport[16];
939 int port; 939 int port;
940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
941 if (port < 1 || port > 65535)
942 return -EINVAL;
941 err = nfsd_create_serv(); 943 err = nfsd_create_serv();
942 if (!err) { 944 if (!err) {
943 err = svc_create_xprt(nfsd_serv, 945 err = svc_create_xprt(nfsd_serv,
944 transport, port, 946 transport, PF_INET, port,
945 SVC_SOCK_ANONYMOUS); 947 SVC_SOCK_ANONYMOUS);
946 if (err == -ENOENT) 948 if (err == -ENOENT)
947 /* Give a reasonable perror msg for 949 /* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
960 char transport[16]; 962 char transport[16];
961 int port; 963 int port;
962 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 964 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
963 if (port == 0) 965 if (port < 1 || port > 65535)
964 return -EINVAL; 966 return -EINVAL;
965 if (nfsd_serv) { 967 if (nfsd_serv) {
966 xprt = svc_find_xprt(nfsd_serv, transport, 968 xprt = svc_find_xprt(nfsd_serv, transport,
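Both the add-port and remove-port paths above now reject ports outside 1..65535 before creating or searching for a transport. The check, as a hypothetical helper:

    /* Hypothetical helper mirroring the range check added above. */
    static bool nfsd_port_is_valid(int port)
    {
        return port >= 1 && port <= 65535;  /* reject 0 and >16-bit values */
    }
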
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..bc3567bab8c4 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
233 nfsd_last_thread, nfsd, THIS_MODULE); 232 nfsd_last_thread, nfsd, THIS_MODULE);
234 if (nfsd_serv == NULL) 233 if (nfsd_serv == NULL)
235 err = -ENOMEM; 234 err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
244 if (!list_empty(&nfsd_serv->sv_permsocks)) 243 if (!list_empty(&nfsd_serv->sv_permsocks))
245 return 0; 244 return 0;
246 245
247 error = svc_create_xprt(nfsd_serv, "udp", port, 246 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
248 SVC_SOCK_DEFAULTS); 247 SVC_SOCK_DEFAULTS);
249 if (error < 0) 248 if (error < 0)
250 return error; 249 return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
253 if (error < 0) 252 if (error < 0)
254 return error; 253 return error;
255 254
256 error = svc_create_xprt(nfsd_serv, "tcp", port, 255 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
257 SVC_SOCK_DEFAULTS); 256 SVC_SOCK_DEFAULTS);
258 if (error < 0) 257 if (error < 0)
259 return error; 258 return error;
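With sv_family removed from svc_serv, the protocol family is no longer a property of the whole service: each svc_create_xprt() call names its own family, which is the prerequisite for one nfsd service owning IPv4 and IPv6 listeners side by side. A sketch of where that leads (the PF_INET6 call is an assumption about the direction of the series, not part of this diff):

    err = svc_create_xprt(serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS);
    if (err < 0)
        return err;
    err = svc_create_xprt(serv, "tcp", PF_INET6, port, SVC_SOCK_DEFAULTS);
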
diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
index c7d4b2e606a5..ec073d8288d9 100644
--- a/include/drm/drm_crtc_helper.h
+++ b/include/drm/drm_crtc_helper.h
@@ -33,7 +33,6 @@
33#ifndef __DRM_CRTC_HELPER_H__ 33#ifndef __DRM_CRTC_HELPER_H__
34#define __DRM_CRTC_HELPER_H__ 34#define __DRM_CRTC_HELPER_H__
35 35
36#include <linux/i2c.h>
37#include <linux/spinlock.h> 36#include <linux/spinlock.h>
38#include <linux/types.h> 37#include <linux/types.h>
39#include <linux/idr.h> 38#include <linux/idr.h>
@@ -92,7 +91,7 @@ struct drm_connector_helper_funcs {
92extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY); 91extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY);
93extern void drm_helper_disable_unused_functions(struct drm_device *dev); 92extern void drm_helper_disable_unused_functions(struct drm_device *dev);
94extern int drm_helper_hotplug_stage_two(struct drm_device *dev); 93extern int drm_helper_hotplug_stage_two(struct drm_device *dev);
95extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow); 94extern bool drm_helper_initial_config(struct drm_device *dev);
96extern int drm_crtc_helper_set_config(struct drm_mode_set *set); 95extern int drm_crtc_helper_set_config(struct drm_mode_set *set);
97extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, 96extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc,
98 struct drm_display_mode *mode, 97 struct drm_display_mode *mode,
diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h
index 013551d03c03..26641e95e0a4 100644
--- a/include/drm/drm_os_linux.h
+++ b/include/drm/drm_os_linux.h
@@ -7,12 +7,12 @@
7#include <linux/delay.h> 7#include <linux/delay.h>
8 8
9#ifndef readq 9#ifndef readq
10static u64 readq(void __iomem *reg) 10static inline u64 readq(void __iomem *reg)
11{ 11{
12 return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32); 12 return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32);
13} 13}
14 14
15static void writeq(u64 val, void __iomem *reg) 15static inline void writeq(u64 val, void __iomem *reg)
16{ 16{
17 writel(val & 0xffffffff, reg); 17 writel(val & 0xffffffff, reg);
18 writel(val >> 32, reg + 0x4UL); 18 writel(val >> 32, reg + 0x4UL);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4d248b3f1323..8815a3456b3b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -649,6 +649,12 @@ struct transaction_s
649 int t_handle_count; 649 int t_handle_count;
650 650
651 /* 651 /*
652 * This transaction is being forced and some process is
653 * waiting for it to finish.
654 */
655 int t_synchronous_commit:1;
656
657 /*
652 * For use by the filesystem to store fs-specific data 658 * For use by the filesystem to store fs-specific data
653 * structures associated with the transaction 659 * structures associated with the transaction
654 */ 660 */
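The new t_synchronous_commit bit above lets a task that is forcing a transaction mark it as awaited. A plausible consumer, assumed from the companion commit-path change rather than shown in this diff, is the jbd2 commit code picking a synchronous submission hint:

    /* Sketch (assumed consumer): push commit I/O harder when a process
     * is known to be waiting on this transaction; bh is the journal
     * buffer being submitted. */
    int write_op = WRITE;
    if (commit_transaction->t_synchronous_commit)
        write_op = WRITE_SYNC;
    submit_bh(write_op, bh);
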
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8cc8807f77d6..bde2557c2a9c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -166,8 +166,7 @@ struct nfs_inode {
166 */ 166 */
167 struct radix_tree_root nfs_page_tree; 167 struct radix_tree_root nfs_page_tree;
168 168
169 unsigned long ncommit, 169 unsigned long npages;
170 npages;
171 170
172 /* Open contexts for shared mmap writes */ 171 /* Open contexts for shared mmap writes */
173 struct list_head open_files; 172 struct list_head open_files;
@@ -207,6 +206,7 @@ struct nfs_inode {
207#define NFS_INO_STALE (1) /* possible stale inode */ 206#define NFS_INO_STALE (1) /* possible stale inode */
208#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ 207#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
209#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ 208#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */
209#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
210 210
211static inline struct nfs_inode *NFS_I(const struct inode *inode) 211static inline struct nfs_inode *NFS_I(const struct inode *inode)
212{ 212{
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 9bb81aec91cf..29b1e40dce99 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -106,6 +106,11 @@ struct nfs_server {
106 u32 attr_bitmask[2];/* V4 bitmask representing the set 106 u32 attr_bitmask[2];/* V4 bitmask representing the set
107 of attributes supported on this 107 of attributes supported on this
108 filesystem */ 108 filesystem */
109 u32 cache_consistency_bitmask[2];
110 /* V4 bitmask representing the subset
111 of change attribute, size, ctime
112 and mtime attributes supported by
113 the server */
109 u32 acl_bitmask; /* V4 bitmask representing the ACEs 114 u32 acl_bitmask; /* V4 bitmask representing the ACEs
110 that are supported on this 115 that are supported on this
111 filesystem */ 116 filesystem */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 43a713fce11c..b89c34e40bc2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -27,12 +27,8 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
27} 27}
28 28
29struct nfs_fattr { 29struct nfs_fattr {
30 unsigned short valid; /* which fields are valid */ 30 unsigned int valid; /* which fields are valid */
31 __u64 pre_size; /* pre_op_attr.size */ 31 umode_t mode;
32 struct timespec pre_mtime; /* pre_op_attr.mtime */
33 struct timespec pre_ctime; /* pre_op_attr.ctime */
34 enum nfs_ftype type; /* always use NFSv2 types */
35 __u32 mode;
36 __u32 nlink; 32 __u32 nlink;
37 __u32 uid; 33 __u32 uid;
38 __u32 gid; 34 __u32 gid;
@@ -52,19 +48,55 @@ struct nfs_fattr {
52 struct timespec atime; 48 struct timespec atime;
53 struct timespec mtime; 49 struct timespec mtime;
54 struct timespec ctime; 50 struct timespec ctime;
55 __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */
56 __u64 change_attr; /* NFSv4 change attribute */ 51 __u64 change_attr; /* NFSv4 change attribute */
57 __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ 52 __u64 pre_change_attr;/* pre-op NFSv4 change attribute */
53 __u64 pre_size; /* pre_op_attr.size */
54 struct timespec pre_mtime; /* pre_op_attr.mtime */
55 struct timespec pre_ctime; /* pre_op_attr.ctime */
58 unsigned long time_start; 56 unsigned long time_start;
59 unsigned long gencount; 57 unsigned long gencount;
60}; 58};
61 59
62#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ 60#define NFS_ATTR_FATTR_TYPE (1U << 0)
63#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ 61#define NFS_ATTR_FATTR_MODE (1U << 1)
64#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ 62#define NFS_ATTR_FATTR_NLINK (1U << 2)
65#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ 63#define NFS_ATTR_FATTR_OWNER (1U << 3)
66#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ 64#define NFS_ATTR_FATTR_GROUP (1U << 4)
67#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ 65#define NFS_ATTR_FATTR_RDEV (1U << 5)
66#define NFS_ATTR_FATTR_SIZE (1U << 6)
67#define NFS_ATTR_FATTR_PRESIZE (1U << 7)
68#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8)
69#define NFS_ATTR_FATTR_SPACE_USED (1U << 9)
70#define NFS_ATTR_FATTR_FSID (1U << 10)
71#define NFS_ATTR_FATTR_FILEID (1U << 11)
72#define NFS_ATTR_FATTR_ATIME (1U << 12)
73#define NFS_ATTR_FATTR_MTIME (1U << 13)
74#define NFS_ATTR_FATTR_CTIME (1U << 14)
75#define NFS_ATTR_FATTR_PREMTIME (1U << 15)
76#define NFS_ATTR_FATTR_PRECTIME (1U << 16)
77#define NFS_ATTR_FATTR_CHANGE (1U << 17)
78#define NFS_ATTR_FATTR_PRECHANGE (1U << 18)
79#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */
80
81#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
82 | NFS_ATTR_FATTR_MODE \
83 | NFS_ATTR_FATTR_NLINK \
84 | NFS_ATTR_FATTR_OWNER \
85 | NFS_ATTR_FATTR_GROUP \
86 | NFS_ATTR_FATTR_RDEV \
87 | NFS_ATTR_FATTR_SIZE \
88 | NFS_ATTR_FATTR_FSID \
89 | NFS_ATTR_FATTR_FILEID \
90 | NFS_ATTR_FATTR_ATIME \
91 | NFS_ATTR_FATTR_MTIME \
92 | NFS_ATTR_FATTR_CTIME)
93#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \
94 | NFS_ATTR_FATTR_BLOCKS_USED)
95#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
96 | NFS_ATTR_FATTR_SPACE_USED)
97#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
98 | NFS_ATTR_FATTR_SPACE_USED \
99 | NFS_ATTR_FATTR_CHANGE)
68 100
69/* 101/*
70 * Info on the file system 102 * Info on the file system
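The old five-flag valid word becomes a per-attribute bitmask above, and the per-version masks (NFS_ATTR_FATTR_V2/V3/V4) are just unions of the individual bits. Consumers can now trust fields one at a time instead of assuming whole version-sized sets; a sketch:

    /* Sketch: per-field presence tests against the new mask. */
    if (fattr->valid & NFS_ATTR_FATTR_MODE) {
        inode->i_mode &= S_IFMT;                /* keep the file type */
        inode->i_mode |= fattr->mode & ~S_IFMT; /* adopt new permissions */
    }
    /* Whole-version completeness collapses to one compare: */
    bool v3_complete = (fattr->valid & NFS_ATTR_FATTR_V3) == NFS_ATTR_FATTR_V3;
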
@@ -836,6 +868,7 @@ struct nfs_rpc_ops {
836 int (*lock)(struct file *, int, struct file_lock *); 868 int (*lock)(struct file *, int, struct file_lock *);
837 int (*lock_check_bounds)(const struct file_lock *); 869 int (*lock_check_bounds)(const struct file_lock *);
838 void (*clear_acl_cache)(struct inode *); 870 void (*clear_acl_cache)(struct inode *);
871 void (*close_context)(struct nfs_open_context *ctx, int);
839}; 872};
840 873
841/* 874/*
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3435d24bfe55..d3a4c0231933 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -69,7 +69,6 @@ struct svc_serv {
69 struct list_head sv_tempsocks; /* all temporary sockets */ 69 struct list_head sv_tempsocks; /* all temporary sockets */
70 int sv_tmpcnt; /* count of temporary sockets */ 70 int sv_tmpcnt; /* count of temporary sockets */
71 struct timer_list sv_temptimer; /* timer for aging temporary sockets */ 71 struct timer_list sv_temptimer; /* timer for aging temporary sockets */
72 sa_family_t sv_family; /* listener's address family */
73 72
74 char * sv_name; /* service name */ 73 char * sv_name; /* service name */
75 74
@@ -385,19 +384,19 @@ struct svc_procedure {
385/* 384/*
386 * Function prototypes. 385 * Function prototypes.
387 */ 386 */
388struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, 387struct svc_serv *svc_create(struct svc_program *, unsigned int,
389 void (*shutdown)(struct svc_serv *)); 388 void (*shutdown)(struct svc_serv *));
390struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, 389struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
391 struct svc_pool *pool); 390 struct svc_pool *pool);
392void svc_exit_thread(struct svc_rqst *); 391void svc_exit_thread(struct svc_rqst *);
393struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, 392struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
394 sa_family_t, void (*shutdown)(struct svc_serv *), 393 void (*shutdown)(struct svc_serv *),
395 svc_thread_fn, struct module *); 394 svc_thread_fn, struct module *);
396int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); 395int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
397void svc_destroy(struct svc_serv *); 396void svc_destroy(struct svc_serv *);
398int svc_process(struct svc_rqst *); 397int svc_process(struct svc_rqst *);
399int svc_register(const struct svc_serv *, const unsigned short, 398int svc_register(const struct svc_serv *, const int,
400 const unsigned short); 399 const unsigned short, const unsigned short);
401 400
402void svc_wake_up(struct svc_serv *); 401void svc_wake_up(struct svc_serv *);
403void svc_reserve(struct svc_rqst *rqstp, int space); 402void svc_reserve(struct svc_rqst *rqstp, int space);
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 0127daca4354..0d9cb6ef28b0 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -71,7 +71,8 @@ int svc_reg_xprt_class(struct svc_xprt_class *);
71void svc_unreg_xprt_class(struct svc_xprt_class *); 71void svc_unreg_xprt_class(struct svc_xprt_class *);
72void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, 72void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
73 struct svc_serv *); 73 struct svc_serv *);
74int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); 74int svc_create_xprt(struct svc_serv *, const char *, const int,
75 const unsigned short, int);
75void svc_xprt_enqueue(struct svc_xprt *xprt); 76void svc_xprt_enqueue(struct svc_xprt *xprt);
76void svc_xprt_received(struct svc_xprt *); 77void svc_xprt_received(struct svc_xprt *);
77void svc_xprt_put(struct svc_xprt *xprt); 78void svc_xprt_put(struct svc_xprt *xprt);
@@ -80,7 +81,8 @@ void svc_close_xprt(struct svc_xprt *xprt);
80void svc_delete_xprt(struct svc_xprt *xprt); 81void svc_delete_xprt(struct svc_xprt *xprt);
81int svc_port_is_privileged(struct sockaddr *sin); 82int svc_port_is_privileged(struct sockaddr *sin);
82int svc_print_xprts(char *buf, int maxlen); 83int svc_print_xprts(char *buf, int maxlen);
83struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); 84struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
85 const sa_family_t af, const unsigned short port);
84int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); 86int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
85 87
86static inline void svc_xprt_get(struct svc_xprt *xprt) 88static inline void svc_xprt_get(struct svc_xprt *xprt)
@@ -88,29 +90,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt)
88 kref_get(&xprt->xpt_ref); 90 kref_get(&xprt->xpt_ref);
89} 91}
90static inline void svc_xprt_set_local(struct svc_xprt *xprt, 92static inline void svc_xprt_set_local(struct svc_xprt *xprt,
91 struct sockaddr *sa, int salen) 93 const struct sockaddr *sa,
94 const size_t salen)
92{ 95{
93 memcpy(&xprt->xpt_local, sa, salen); 96 memcpy(&xprt->xpt_local, sa, salen);
94 xprt->xpt_locallen = salen; 97 xprt->xpt_locallen = salen;
95} 98}
96static inline void svc_xprt_set_remote(struct svc_xprt *xprt, 99static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
97 struct sockaddr *sa, int salen) 100 const struct sockaddr *sa,
101 const size_t salen)
98{ 102{
99 memcpy(&xprt->xpt_remote, sa, salen); 103 memcpy(&xprt->xpt_remote, sa, salen);
100 xprt->xpt_remotelen = salen; 104 xprt->xpt_remotelen = salen;
101} 105}
102static inline unsigned short svc_addr_port(struct sockaddr *sa) 106static inline unsigned short svc_addr_port(const struct sockaddr *sa)
103{ 107{
104 unsigned short ret = 0; 108 const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
109 const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
110
105 switch (sa->sa_family) { 111 switch (sa->sa_family) {
106 case AF_INET: 112 case AF_INET:
107 ret = ntohs(((struct sockaddr_in *)sa)->sin_port); 113 return ntohs(sin->sin_port);
108 break;
109 case AF_INET6: 114 case AF_INET6:
110 ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); 115 return ntohs(sin6->sin6_port);
111 break;
112 } 116 }
113 return ret; 117
118 return 0;
114} 119}
115 120
116static inline size_t svc_addr_len(struct sockaddr *sa) 121static inline size_t svc_addr_len(struct sockaddr *sa)
@@ -124,36 +129,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa)
124 return -EAFNOSUPPORT; 129 return -EAFNOSUPPORT;
125} 130}
126 131
127static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) 132static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt)
128{ 133{
129 return svc_addr_port((struct sockaddr *)&xprt->xpt_local); 134 return svc_addr_port((const struct sockaddr *)&xprt->xpt_local);
130} 135}
131 136
132static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) 137static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt)
133{ 138{
134 return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); 139 return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote);
135} 140}
136 141
137static inline char *__svc_print_addr(struct sockaddr *addr, 142static inline char *__svc_print_addr(const struct sockaddr *addr,
138 char *buf, size_t len) 143 char *buf, const size_t len)
139{ 144{
145 const struct sockaddr_in *sin = (const struct sockaddr_in *)addr;
146 const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr;
147
140 switch (addr->sa_family) { 148 switch (addr->sa_family) {
141 case AF_INET: 149 case AF_INET:
142 snprintf(buf, len, "%pI4, port=%u", 150 snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr,
143 &((struct sockaddr_in *)addr)->sin_addr, 151 ntohs(sin->sin_port));
144 ntohs(((struct sockaddr_in *) addr)->sin_port));
145 break; 152 break;
146 153
147 case AF_INET6: 154 case AF_INET6:
148 snprintf(buf, len, "%pI6, port=%u", 155 snprintf(buf, len, "%pI6, port=%u",
149 &((struct sockaddr_in6 *)addr)->sin6_addr, 156 &sin6->sin6_addr,
150 ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); 157 ntohs(sin6->sin6_port));
151 break; 158 break;
152 159
153 default: 160 default:
154 snprintf(buf, len, "unknown address type: %d", addr->sa_family); 161 snprintf(buf, len, "unknown address type: %d", addr->sa_family);
155 break; 162 break;
156 } 163 }
164
157 return buf; 165 return buf;
158} 166}
159#endif /* SUNRPC_SVC_XPRT_H */ 167#endif /* SUNRPC_SVC_XPRT_H */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 11fc71d50c1e..1758d9f5b5c3 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
235 */ 235 */
236int xprt_register_transport(struct xprt_class *type); 236int xprt_register_transport(struct xprt_class *type);
237int xprt_unregister_transport(struct xprt_class *type); 237int xprt_unregister_transport(struct xprt_class *type);
238int xprt_load_transport(const char *);
238void xprt_set_retrans_timeout_def(struct rpc_task *task); 239void xprt_set_retrans_timeout_def(struct rpc_task *task);
239void xprt_set_retrans_timeout_rtt(struct rpc_task *task); 240void xprt_set_retrans_timeout_rtt(struct rpc_task *task);
240void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); 241void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
@@ -259,6 +260,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
259#define XPRT_BOUND (4) 260#define XPRT_BOUND (4)
260#define XPRT_BINDING (5) 261#define XPRT_BINDING (5)
261#define XPRT_CLOSING (6) 262#define XPRT_CLOSING (6)
263#define XPRT_CONNECTION_ABORT (7)
262 264
263static inline void xprt_set_connected(struct rpc_xprt *xprt) 265static inline void xprt_set_connected(struct rpc_xprt *xprt)
264{ 266{
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 5592883e1e4a..afd91c78ce8e 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -17,28 +17,6 @@ config SUNRPC_XPRT_RDMA
17 17
18 If unsure, say N. 18 If unsure, say N.
19 19
20config SUNRPC_REGISTER_V4
21 bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
22 depends on SUNRPC && EXPERIMENTAL
23 default n
24 help
25 Sun added support for registering RPC services at an IPv6
26 address by creating two new versions of the rpcbind protocol
27 (RFC 1833).
28
29 This option enables support in the kernel RPC server for
30 registering kernel RPC services via version 4 of the rpcbind
31 protocol. If you enable this option, you must run a portmapper
32 daemon that supports rpcbind protocol version 4.
33
34 Serving NFS over IPv6 from knfsd (the kernel's NFS server)
35 requires that you enable this option and use a portmapper that
36 supports rpcbind version 4.
37
38 If unsure, say N to get traditional behavior (register kernel
39 RPC services using only rpcbind version 2). Distributions
40 using the legacy Linux portmapper daemon must say N here.
41
42config RPCSEC_GSS_KRB5 20config RPCSEC_GSS_KRB5
43 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" 21 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
44 depends on SUNRPC && EXPERIMENTAL 22 depends on SUNRPC && EXPERIMENTAL
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 836f15c0c4a3..5abab094441f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1032,27 +1032,20 @@ call_connect_status(struct rpc_task *task)
1032 dprint_status(task); 1032 dprint_status(task);
1033 1033
1034 task->tk_status = 0; 1034 task->tk_status = 0;
1035 if (status >= 0) { 1035 if (status >= 0 || status == -EAGAIN) {
1036 clnt->cl_stats->netreconn++; 1036 clnt->cl_stats->netreconn++;
1037 task->tk_action = call_transmit; 1037 task->tk_action = call_transmit;
1038 return; 1038 return;
1039 } 1039 }
1040 1040
1041 /* Something failed: remote service port may have changed */
1042 rpc_force_rebind(clnt);
1043
1044 switch (status) { 1041 switch (status) {
1045 case -ENOTCONN:
1046 case -EAGAIN:
1047 task->tk_action = call_bind;
1048 if (!RPC_IS_SOFT(task))
1049 return;
1050 /* if soft mounted, test if we've timed out */ 1042 /* if soft mounted, test if we've timed out */
1051 case -ETIMEDOUT: 1043 case -ETIMEDOUT:
1052 task->tk_action = call_timeout; 1044 task->tk_action = call_timeout;
1053 return; 1045 break;
1046 default:
1047 rpc_exit(task, -EIO);
1054 } 1048 }
1055 rpc_exit(task, -EIO);
1056} 1049}
1057 1050
1058/* 1051/*
@@ -1105,14 +1098,26 @@ static void
1105call_transmit_status(struct rpc_task *task) 1098call_transmit_status(struct rpc_task *task)
1106{ 1099{
1107 task->tk_action = call_status; 1100 task->tk_action = call_status;
1108 /* 1101 switch (task->tk_status) {
1109 * Special case: if we've been waiting on the socket's write_space() 1102 case -EAGAIN:
1110 * callback, then don't call xprt_end_transmit(). 1103 break;
1111 */ 1104 default:
1112 if (task->tk_status == -EAGAIN) 1105 xprt_end_transmit(task);
1113 return; 1106 /*
1114 xprt_end_transmit(task); 1107 * Special cases: if we've been waiting on the
1115 rpc_task_force_reencode(task); 1108 * socket's write_space() callback, or if the
1109 * socket just returned a connection error,
1110 * then hold onto the transport lock.
1111 */
1112 case -ECONNREFUSED:
1113 case -ECONNRESET:
1114 case -ENOTCONN:
1115 case -EHOSTDOWN:
1116 case -EHOSTUNREACH:
1117 case -ENETUNREACH:
1118 case -EPIPE:
1119 rpc_task_force_reencode(task);
1120 }
1116} 1121}
1117 1122
1118/* 1123/*
@@ -1152,9 +1157,12 @@ call_status(struct rpc_task *task)
1152 xprt_conditional_disconnect(task->tk_xprt, 1157 xprt_conditional_disconnect(task->tk_xprt,
1153 req->rq_connect_cookie); 1158 req->rq_connect_cookie);
1154 break; 1159 break;
1160 case -ECONNRESET:
1155 case -ECONNREFUSED: 1161 case -ECONNREFUSED:
1156 case -ENOTCONN:
1157 rpc_force_rebind(clnt); 1162 rpc_force_rebind(clnt);
1163 rpc_delay(task, 3*HZ);
1164 case -EPIPE:
1165 case -ENOTCONN:
1158 task->tk_action = call_bind; 1166 task->tk_action = call_bind;
1159 break; 1167 break;
1160 case -EAGAIN: 1168 case -EAGAIN:
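In the call_status() hunk just above, ECONNRESET/ECONNREFUSED now force a rebind and back off for three seconds, then deliberately fall through to the EPIPE/ENOTCONN handling that retries via call_bind. The fall-through is intentional; annotated sketch:

    case -ECONNRESET:
    case -ECONNREFUSED:
        rpc_force_rebind(clnt);     /* server port may have moved */
        rpc_delay(task, 3*HZ);      /* back off before the retry */
        /* fall through */
    case -EPIPE:
    case -ENOTCONN:
        task->tk_action = call_bind;
        break;
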
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 03ae007641e4..beee6da33035 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -63,9 +63,16 @@ enum {
63 * r_owner 63 * r_owner
64 * 64 *
65 * The "owner" is allowed to unset a service in the rpcbind database. 65 * The "owner" is allowed to unset a service in the rpcbind database.
66 * We always use the following (arbitrary) fixed string. 66 *
67 * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a
68 * UID which it maps to a local user name via a password lookup.
69 * In all other cases it is ignored.
70 *
71 * For SET/UNSET requests, user space provides a value, even for
72 * network requests, and GETADDR uses an empty string. We follow
73 * those precedents here.
67 */ 74 */
68#define RPCB_OWNER_STRING "rpcb" 75#define RPCB_OWNER_STRING "0"
69#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) 76#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING)
70 77
71static void rpcb_getport_done(struct rpc_task *, void *); 78static void rpcb_getport_done(struct rpc_task *, void *);
@@ -124,12 +131,6 @@ static const struct sockaddr_in rpcb_inaddr_loopback = {
124 .sin_port = htons(RPCBIND_PORT), 131 .sin_port = htons(RPCBIND_PORT),
125}; 132};
126 133
127static const struct sockaddr_in6 rpcb_in6addr_loopback = {
128 .sin6_family = AF_INET6,
129 .sin6_addr = IN6ADDR_LOOPBACK_INIT,
130 .sin6_port = htons(RPCBIND_PORT),
131};
132
133static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, 134static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr,
134 size_t addrlen, u32 version) 135 size_t addrlen, u32 version)
135{ 136{
@@ -176,9 +177,10 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
176 return rpc_create(&args); 177 return rpc_create(&args);
177} 178}
178 179
179static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, 180static int rpcb_register_call(const u32 version, struct rpc_message *msg)
180 u32 version, struct rpc_message *msg)
181{ 181{
182 struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback;
183 size_t addrlen = sizeof(rpcb_inaddr_loopback);
182 struct rpc_clnt *rpcb_clnt; 184 struct rpc_clnt *rpcb_clnt;
183 int result, error = 0; 185 int result, error = 0;
184 186
@@ -192,7 +194,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
192 error = PTR_ERR(rpcb_clnt); 194 error = PTR_ERR(rpcb_clnt);
193 195
194 if (error < 0) { 196 if (error < 0) {
195 printk(KERN_WARNING "RPC: failed to contact local rpcbind " 197 dprintk("RPC: failed to contact local rpcbind "
196 "server (errno %d).\n", -error); 198 "server (errno %d).\n", -error);
197 return error; 199 return error;
198 } 200 }
@@ -254,25 +256,23 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
254 if (port) 256 if (port)
255 msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; 257 msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
256 258
257 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, 259 return rpcb_register_call(RPCBVERS_2, &msg);
258 sizeof(rpcb_inaddr_loopback),
259 RPCBVERS_2, &msg);
260} 260}
261 261
262/* 262/*
263 * Fill in AF_INET family-specific arguments to register 263 * Fill in AF_INET family-specific arguments to register
264 */ 264 */
265static int rpcb_register_netid4(struct sockaddr_in *address_to_register, 265static int rpcb_register_inet4(const struct sockaddr *sap,
266 struct rpc_message *msg) 266 struct rpc_message *msg)
267{ 267{
268 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
268 struct rpcbind_args *map = msg->rpc_argp; 269 struct rpcbind_args *map = msg->rpc_argp;
269 unsigned short port = ntohs(address_to_register->sin_port); 270 unsigned short port = ntohs(sin->sin_port);
270 char buf[32]; 271 char buf[32];
271 272
272 /* Construct AF_INET universal address */ 273 /* Construct AF_INET universal address */
273 snprintf(buf, sizeof(buf), "%pI4.%u.%u", 274 snprintf(buf, sizeof(buf), "%pI4.%u.%u",
274 &address_to_register->sin_addr.s_addr, 275 &sin->sin_addr.s_addr, port >> 8, port & 0xff);
275 port >> 8, port & 0xff);
276 map->r_addr = buf; 276 map->r_addr = buf;
277 277
278 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " 278 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -284,29 +284,27 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
284 if (port) 284 if (port)
285 msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; 285 msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
286 286
287 return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, 287 return rpcb_register_call(RPCBVERS_4, msg);
288 sizeof(rpcb_inaddr_loopback),
289 RPCBVERS_4, msg);
290} 288}
291 289
292/* 290/*
293 * Fill in AF_INET6 family-specific arguments to register 291 * Fill in AF_INET6 family-specific arguments to register
294 */ 292 */
295static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, 293static int rpcb_register_inet6(const struct sockaddr *sap,
296 struct rpc_message *msg) 294 struct rpc_message *msg)
297{ 295{
296 const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
298 struct rpcbind_args *map = msg->rpc_argp; 297 struct rpcbind_args *map = msg->rpc_argp;
299 unsigned short port = ntohs(address_to_register->sin6_port); 298 unsigned short port = ntohs(sin6->sin6_port);
300 char buf[64]; 299 char buf[64];
301 300
302 /* Construct AF_INET6 universal address */ 301 /* Construct AF_INET6 universal address */
303 if (ipv6_addr_any(&address_to_register->sin6_addr)) 302 if (ipv6_addr_any(&sin6->sin6_addr))
304 snprintf(buf, sizeof(buf), "::.%u.%u", 303 snprintf(buf, sizeof(buf), "::.%u.%u",
305 port >> 8, port & 0xff); 304 port >> 8, port & 0xff);
306 else 305 else
307 snprintf(buf, sizeof(buf), "%pI6.%u.%u", 306 snprintf(buf, sizeof(buf), "%pI6.%u.%u",
308 &address_to_register->sin6_addr, 307 &sin6->sin6_addr, port >> 8, port & 0xff);
309 port >> 8, port & 0xff);
310 map->r_addr = buf; 308 map->r_addr = buf;
311 309
312 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " 310 dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -318,9 +316,21 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
318 if (port) 316 if (port)
319 msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; 317 msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
320 318
321 return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, 319 return rpcb_register_call(RPCBVERS_4, msg);
322 sizeof(rpcb_in6addr_loopback), 320}
323 RPCBVERS_4, msg); 321
322static int rpcb_unregister_all_protofamilies(struct rpc_message *msg)
323{
324 struct rpcbind_args *map = msg->rpc_argp;
325
326 dprintk("RPC: unregistering [%u, %u, '%s'] with "
327 "local rpcbind\n",
328 map->r_prog, map->r_vers, map->r_netid);
329
330 map->r_addr = "";
331 msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
332
333 return rpcb_register_call(RPCBVERS_4, msg);
324} 334}
325 335
326/** 336/**
@@ -340,10 +350,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
340 * invoke this function once for each [program, version, address, 350 * invoke this function once for each [program, version, address,
341 * netid] tuple they wish to advertise. 351 * netid] tuple they wish to advertise.
342 * 352 *
343 * Callers may also unregister RPC services that are no longer 353 * Callers may also unregister RPC services that are registered at a
344 * available by setting the port number in the passed-in address 354 * specific address by setting the port number in @address to zero.
345 * to zero. Callers pass a netid of "" to unregister all 355 * They may unregister all registered protocol families at once for
346 * transport netids associated with [program, version, address]. 356 * a service by passing a NULL @address argument. If @netid is ""
357 * then all netids for [program, version, address] are unregistered.
347 * 358 *
348 * This function uses rpcbind protocol version 4 to contact the 359 * This function uses rpcbind protocol version 4 to contact the
349 * local rpcbind daemon. The local rpcbind daemon must support 360 * local rpcbind daemon. The local rpcbind daemon must support
@@ -378,13 +389,14 @@ int rpcb_v4_register(const u32 program, const u32 version,
378 .rpc_argp = &map, 389 .rpc_argp = &map,
379 }; 390 };
380 391
392 if (address == NULL)
393 return rpcb_unregister_all_protofamilies(&msg);
394
381 switch (address->sa_family) { 395 switch (address->sa_family) {
382 case AF_INET: 396 case AF_INET:
383 return rpcb_register_netid4((struct sockaddr_in *)address, 397 return rpcb_register_inet4(address, &msg);
384 &msg);
385 case AF_INET6: 398 case AF_INET6:
386 return rpcb_register_netid6((struct sockaddr_in6 *)address, 399 return rpcb_register_inet6(address, &msg);
387 &msg);
388 } 400 }
389 401
390 return -EAFNOSUPPORT; 402 return -EAFNOSUPPORT;
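The kernel-doc above now describes three calling conventions for rpcb_v4_register(): a normal SET, an UNSET of one address (zero port), and an UNSET of every protocol family at once (NULL address). A minimal sketch of all three, assuming a hypothetical service (the ex_* names and the program number are made up; the "tcp" netid assumes an IPv4 TCP listener, and error handling is elided):

    #include <linux/sunrpc/clnt.h>

    #define EX_PROG 400999          /* hypothetical program number */
    #define EX_VERS 1

    static int ex_advertise(const struct sockaddr *sap)
    {
            /* SET: advertise [program, version, address, netid];
             * with a zero port in @sap this becomes an UNSET of
             * that one address instead */
            return rpcb_v4_register(EX_PROG, EX_VERS, sap, "tcp");
    }

    static void ex_withdraw(void)
    {
            /* NULL address: UNSET all protocol families at once */
            rpcb_v4_register(EX_PROG, EX_VERS, NULL, "");
    }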
@@ -579,7 +591,7 @@ void rpcb_getport_async(struct rpc_task *task)
579 map->r_xprt = xprt_get(xprt); 591 map->r_xprt = xprt_get(xprt);
580 map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); 592 map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
581 map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); 593 map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR);
582 map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ 594 map->r_owner = "";
583 map->r_status = -EIO; 595 map->r_status = -EIO;
584 596
585 child = rpcb_call_async(rpcb_clnt, map, proc); 597 child = rpcb_call_async(rpcb_clnt, map, proc);
@@ -703,11 +715,16 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
703 *portp = 0; 715 *portp = 0;
704 addr_len = ntohl(*p++); 716 addr_len = ntohl(*p++);
705 717
718 if (addr_len == 0) {
719 dprintk("RPC: rpcb_decode_getaddr: "
720 "service is not registered\n");
721 return 0;
722 }
723
706 /* 724 /*
707 * Simple sanity check. The smallest possible universal 725 * Simple sanity check.
708 * address is an IPv4 address string containing 11 bytes.
709 */ 726 */
710 if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN) 727 if (addr_len > RPCBIND_MAXUADDRLEN)
711 goto out_err; 728 goto out_err;
712 729
713 /* 730 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb507e2bb94d..9f2f2412a2f3 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
359 */ 359 */
360static struct svc_serv * 360static struct svc_serv *
361__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, 361__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
362 sa_family_t family, void (*shutdown)(struct svc_serv *serv)) 362 void (*shutdown)(struct svc_serv *serv))
363{ 363{
364 struct svc_serv *serv; 364 struct svc_serv *serv;
365 unsigned int vers; 365 unsigned int vers;
@@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
368 368
369 if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) 369 if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
370 return NULL; 370 return NULL;
371 serv->sv_family = family;
372 serv->sv_name = prog->pg_name; 371 serv->sv_name = prog->pg_name;
373 serv->sv_program = prog; 372 serv->sv_program = prog;
374 serv->sv_nrthreads = 1; 373 serv->sv_nrthreads = 1;
@@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
427 426
428struct svc_serv * 427struct svc_serv *
429svc_create(struct svc_program *prog, unsigned int bufsize, 428svc_create(struct svc_program *prog, unsigned int bufsize,
430 sa_family_t family, void (*shutdown)(struct svc_serv *serv)) 429 void (*shutdown)(struct svc_serv *serv))
431{ 430{
432 return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); 431 return __svc_create(prog, bufsize, /*npools*/1, shutdown);
433} 432}
434EXPORT_SYMBOL_GPL(svc_create); 433EXPORT_SYMBOL_GPL(svc_create);
435 434
436struct svc_serv * 435struct svc_serv *
437svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 436svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
438 sa_family_t family, void (*shutdown)(struct svc_serv *serv), 437 void (*shutdown)(struct svc_serv *serv),
439 svc_thread_fn func, struct module *mod) 438 svc_thread_fn func, struct module *mod)
440{ 439{
441 struct svc_serv *serv; 440 struct svc_serv *serv;
442 unsigned int npools = svc_pool_map_get(); 441 unsigned int npools = svc_pool_map_get();
443 442
444 serv = __svc_create(prog, bufsize, npools, family, shutdown); 443 serv = __svc_create(prog, bufsize, npools, shutdown);
445 444
446 if (serv != NULL) { 445 if (serv != NULL) {
447 serv->sv_function = func; 446 serv->sv_function = func;
@@ -719,8 +718,6 @@ svc_exit_thread(struct svc_rqst *rqstp)
719} 718}
720EXPORT_SYMBOL_GPL(svc_exit_thread); 719EXPORT_SYMBOL_GPL(svc_exit_thread);
721 720
722#ifdef CONFIG_SUNRPC_REGISTER_V4
723
724/* 721/*
725 * Register an "inet" protocol family netid with the local 722 * Register an "inet" protocol family netid with the local
726 * rpcbind daemon via an rpcbind v4 SET request. 723 * rpcbind daemon via an rpcbind v4 SET request.
@@ -735,12 +732,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
735 const unsigned short protocol, 732 const unsigned short protocol,
736 const unsigned short port) 733 const unsigned short port)
737{ 734{
738 struct sockaddr_in sin = { 735 const struct sockaddr_in sin = {
739 .sin_family = AF_INET, 736 .sin_family = AF_INET,
740 .sin_addr.s_addr = htonl(INADDR_ANY), 737 .sin_addr.s_addr = htonl(INADDR_ANY),
741 .sin_port = htons(port), 738 .sin_port = htons(port),
742 }; 739 };
743 char *netid; 740 const char *netid;
741 int error;
744 742
745 switch (protocol) { 743 switch (protocol) {
746 case IPPROTO_UDP: 744 case IPPROTO_UDP:
@@ -750,13 +748,23 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
750 netid = RPCBIND_NETID_TCP; 748 netid = RPCBIND_NETID_TCP;
751 break; 749 break;
752 default: 750 default:
753 return -EPROTONOSUPPORT; 751 return -ENOPROTOOPT;
754 } 752 }
755 753
756 return rpcb_v4_register(program, version, 754 error = rpcb_v4_register(program, version,
757 (struct sockaddr *)&sin, netid); 755 (const struct sockaddr *)&sin, netid);
756
757 /*
758 * User space didn't support rpcbind v4, so retry this
759 * registration request with the legacy rpcbind v2 protocol.
760 */
761 if (error == -EPROTONOSUPPORT)
762 error = rpcb_register(program, version, protocol, port);
763
764 return error;
758} 765}
759 766
767#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
760/* 768/*
761 * Register an "inet6" protocol family netid with the local 769 * Register an "inet6" protocol family netid with the local
762 * rpcbind daemon via an rpcbind v4 SET request. 770 * rpcbind daemon via an rpcbind v4 SET request.
@@ -771,12 +779,13 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
771 const unsigned short protocol, 779 const unsigned short protocol,
772 const unsigned short port) 780 const unsigned short port)
773{ 781{
774 struct sockaddr_in6 sin6 = { 782 const struct sockaddr_in6 sin6 = {
775 .sin6_family = AF_INET6, 783 .sin6_family = AF_INET6,
776 .sin6_addr = IN6ADDR_ANY_INIT, 784 .sin6_addr = IN6ADDR_ANY_INIT,
777 .sin6_port = htons(port), 785 .sin6_port = htons(port),
778 }; 786 };
779 char *netid; 787 const char *netid;
788 int error;
780 789
781 switch (protocol) { 790 switch (protocol) {
782 case IPPROTO_UDP: 791 case IPPROTO_UDP:
@@ -786,12 +795,22 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
786 netid = RPCBIND_NETID_TCP6; 795 netid = RPCBIND_NETID_TCP6;
787 break; 796 break;
788 default: 797 default:
789 return -EPROTONOSUPPORT; 798 return -ENOPROTOOPT;
790 } 799 }
791 800
792 return rpcb_v4_register(program, version, 801 error = rpcb_v4_register(program, version,
793 (struct sockaddr *)&sin6, netid); 802 (const struct sockaddr *)&sin6, netid);
803
804 /*
805 * User space didn't support rpcbind version 4, so we won't
806 * use a PF_INET6 listener.
807 */
808 if (error == -EPROTONOSUPPORT)
809 error = -EAFNOSUPPORT;
810
811 return error;
794} 812}
813#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
795 814
796/* 815/*
797 * Register a kernel RPC service via rpcbind version 4. 816 * Register a kernel RPC service via rpcbind version 4.
@@ -799,69 +818,43 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
799 * Returns zero on success; a negative errno value is returned 818 * Returns zero on success; a negative errno value is returned
800 * if any error occurs. 819 * if any error occurs.
801 */ 820 */
802static int __svc_register(const u32 program, const u32 version, 821static int __svc_register(const char *progname,
803 const sa_family_t family, 822 const u32 program, const u32 version,
823 const int family,
804 const unsigned short protocol, 824 const unsigned short protocol,
805 const unsigned short port) 825 const unsigned short port)
806{ 826{
807 int error; 827 int error = -EAFNOSUPPORT;
808 828
809 switch (family) { 829 switch (family) {
810 case AF_INET: 830 case PF_INET:
811 return __svc_rpcb_register4(program, version, 831 error = __svc_rpcb_register4(program, version,
812 protocol, port); 832 protocol, port);
813 case AF_INET6: 833 break;
834#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
835 case PF_INET6:
814 error = __svc_rpcb_register6(program, version, 836 error = __svc_rpcb_register6(program, version,
815 protocol, port); 837 protocol, port);
816 if (error < 0) 838#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
817 return error;
818
819 /*
820 * Work around bug in some versions of Linux rpcbind
821 * which don't allow registration of both inet and
822 * inet6 netids.
823 *
824 * Error return ignored for now.
825 */
826 __svc_rpcb_register4(program, version,
827 protocol, port);
828 return 0;
829 } 839 }
830 840
831 return -EAFNOSUPPORT; 841 if (error < 0)
832} 842 printk(KERN_WARNING "svc: failed to register %sv%u RPC "
833 843 "service (errno %d).\n", progname, version, -error);
834#else /* CONFIG_SUNRPC_REGISTER_V4 */ 844 return error;
835
836/*
837 * Register a kernel RPC service via rpcbind version 2.
838 *
839 * Returns zero on success; a negative errno value is returned
840 * if any error occurs.
841 */
842static int __svc_register(const u32 program, const u32 version,
843 sa_family_t family,
844 const unsigned short protocol,
845 const unsigned short port)
846{
847 if (family != AF_INET)
848 return -EAFNOSUPPORT;
849
850 return rpcb_register(program, version, protocol, port);
851} 845}
852 846
853#endif /* CONFIG_SUNRPC_REGISTER_V4 */
854
855/** 847/**
856 * svc_register - register an RPC service with the local portmapper 848 * svc_register - register an RPC service with the local portmapper
857 * @serv: svc_serv struct for the service to register 849 * @serv: svc_serv struct for the service to register
850 * @family: protocol family of service's listener socket
858 * @proto: transport protocol number to advertise 851 * @proto: transport protocol number to advertise
859 * @port: port to advertise 852 * @port: port to advertise
860 * 853 *
861 * Service is registered for any address in serv's address family 854 * Service is registered for any address in the passed-in protocol family
862 */ 855 */
863int svc_register(const struct svc_serv *serv, const unsigned short proto, 856int svc_register(const struct svc_serv *serv, const int family,
864 const unsigned short port) 857 const unsigned short proto, const unsigned short port)
865{ 858{
866 struct svc_program *progp; 859 struct svc_program *progp;
867 unsigned int i; 860 unsigned int i;
@@ -879,15 +872,15 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
879 i, 872 i,
880 proto == IPPROTO_UDP? "udp" : "tcp", 873 proto == IPPROTO_UDP? "udp" : "tcp",
881 port, 874 port,
882 serv->sv_family, 875 family,
883 progp->pg_vers[i]->vs_hidden? 876 progp->pg_vers[i]->vs_hidden?
884 " (but not telling portmap)" : ""); 877 " (but not telling portmap)" : "");
885 878
886 if (progp->pg_vers[i]->vs_hidden) 879 if (progp->pg_vers[i]->vs_hidden)
887 continue; 880 continue;
888 881
889 error = __svc_register(progp->pg_prog, i, 882 error = __svc_register(progp->pg_name, progp->pg_prog,
890 serv->sv_family, proto, port); 883 i, family, proto, port);
891 if (error < 0) 884 if (error < 0)
892 break; 885 break;
893 } 886 }
@@ -896,38 +889,31 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
896 return error; 889 return error;
897} 890}
898 891
899#ifdef CONFIG_SUNRPC_REGISTER_V4 892/*
900 893 * If user space is running rpcbind, it should take the v4 UNSET
894 * and clear everything for this [program, version]. If user space
895 * is running portmap, it will reject the v4 UNSET, but won't have
896 * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient
897 * in this case to clear all existing entries for [program, version].
898 */
901static void __svc_unregister(const u32 program, const u32 version, 899static void __svc_unregister(const u32 program, const u32 version,
902 const char *progname) 900 const char *progname)
903{ 901{
904 struct sockaddr_in6 sin6 = {
905 .sin6_family = AF_INET6,
906 .sin6_addr = IN6ADDR_ANY_INIT,
907 .sin6_port = 0,
908 };
909 int error; 902 int error;
910 903
911 error = rpcb_v4_register(program, version, 904 error = rpcb_v4_register(program, version, NULL, "");
912 (struct sockaddr *)&sin6, "");
913 dprintk("svc: %s(%sv%u), error %d\n",
914 __func__, progname, version, error);
915}
916
917#else /* CONFIG_SUNRPC_REGISTER_V4 */
918 905
919static void __svc_unregister(const u32 program, const u32 version, 906 /*
920 const char *progname) 907 * User space didn't support rpcbind v4, so retry this
921{ 908 * request with the legacy rpcbind v2 protocol.
922 int error; 909 */
910 if (error == -EPROTONOSUPPORT)
911 error = rpcb_register(program, version, 0, 0);
923 912
924 error = rpcb_register(program, version, 0, 0);
925 dprintk("svc: %s(%sv%u), error %d\n", 913 dprintk("svc: %s(%sv%u), error %d\n",
926 __func__, progname, version, error); 914 __func__, progname, version, error);
927} 915}
928 916
929#endif /* CONFIG_SUNRPC_REGISTER_V4 */
930
931/* 917/*
932 * All netids, bind addresses and ports registered for [program, version] 918 * All netids, bind addresses and ports registered for [program, version]
933 * are removed from the local rpcbind database (if the service is not 919 * are removed from the local rpcbind database (if the service is not
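The register and unregister paths above now share one fallback idiom, spelled out in the new comments: try rpcbind v4 first, and when user space only runs portmap (which rejects the v4 request with -EPROTONOSUPPORT) degrade to the legacy v2 call. The idiom in isolation, as a sketch (argument plumbing belongs to the caller and is not shown):

    static int ex_register_with_fallback(u32 program, u32 version,
                                         const struct sockaddr *sap,
                                         const char *netid,
                                         int protocol, unsigned short port)
    {
            int error = rpcb_v4_register(program, version, sap, netid);

            /* portmap-only user space rejects the v4 request outright */
            if (error == -EPROTONOSUPPORT)
                    error = rpcb_register(program, version, protocol, port);
            return error;
    }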
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e588df5d6b34..2819ee093f36 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init);
161 161
162static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, 162static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
163 struct svc_serv *serv, 163 struct svc_serv *serv,
164 unsigned short port, int flags) 164 const int family,
165 const unsigned short port,
166 int flags)
165{ 167{
166 struct sockaddr_in sin = { 168 struct sockaddr_in sin = {
167 .sin_family = AF_INET, 169 .sin_family = AF_INET,
@@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
176 struct sockaddr *sap; 178 struct sockaddr *sap;
177 size_t len; 179 size_t len;
178 180
179 switch (serv->sv_family) { 181 switch (family) {
180 case AF_INET: 182 case PF_INET:
181 sap = (struct sockaddr *)&sin; 183 sap = (struct sockaddr *)&sin;
182 len = sizeof(sin); 184 len = sizeof(sin);
183 break; 185 break;
184 case AF_INET6: 186 case PF_INET6:
185 sap = (struct sockaddr *)&sin6; 187 sap = (struct sockaddr *)&sin6;
186 len = sizeof(sin6); 188 len = sizeof(sin6);
187 break; 189 break;
@@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
192 return xcl->xcl_ops->xpo_create(serv, sap, len, flags); 194 return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
193} 195}
194 196
195int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, 197int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
198 const int family, const unsigned short port,
196 int flags) 199 int flags)
197{ 200{
198 struct svc_xprt_class *xcl; 201 struct svc_xprt_class *xcl;
@@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
209 goto err; 212 goto err;
210 213
211 spin_unlock(&svc_xprt_class_lock); 214 spin_unlock(&svc_xprt_class_lock);
212 newxprt = __svc_xpo_create(xcl, serv, port, flags); 215 newxprt = __svc_xpo_create(xcl, serv, family, port, flags);
213 if (IS_ERR(newxprt)) { 216 if (IS_ERR(newxprt)) {
214 module_put(xcl->xcl_owner); 217 module_put(xcl->xcl_owner);
215 return PTR_ERR(newxprt); 218 return PTR_ERR(newxprt);
@@ -1033,7 +1036,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
1033 return dr; 1036 return dr;
1034} 1037}
1035 1038
1036/* 1039/**
1040 * svc_find_xprt - find an RPC transport instance
1041 * @serv: pointer to svc_serv to search
1042 * @xcl_name: C string containing transport's class name
1043 * @af: Address family of transport's local address
1044 * @port: transport's IP port number
1045 *
1037 * Return the transport instance pointer for the endpoint accepting 1046 * Return the transport instance pointer for the endpoint accepting
1038 * connections/peer traffic from the specified transport class, 1047 * connections/peer traffic from the specified transport class,
1039 * address family and port. 1048 * address family and port.
@@ -1042,14 +1051,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
1042 * wild-card, and will result in matching the first transport in the 1051 * wild-card, and will result in matching the first transport in the
1043 * service's list that has a matching class name. 1052 * service's list that has a matching class name.
1044 */ 1053 */
1045struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, 1054struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
1046 int af, int port) 1055 const sa_family_t af, const unsigned short port)
1047{ 1056{
1048 struct svc_xprt *xprt; 1057 struct svc_xprt *xprt;
1049 struct svc_xprt *found = NULL; 1058 struct svc_xprt *found = NULL;
1050 1059
1051 /* Sanity check the args */ 1060 /* Sanity check the args */
1052 if (!serv || !xcl_name) 1061 if (serv == NULL || xcl_name == NULL)
1053 return found; 1062 return found;
1054 1063
1055 spin_lock_bh(&serv->sv_lock); 1064 spin_lock_bh(&serv->sv_lock);
@@ -1058,7 +1067,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
1058 continue; 1067 continue;
1059 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) 1068 if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
1060 continue; 1069 continue;
1061 if (port && port != svc_xprt_local_port(xprt)) 1070 if (port != 0 && port != svc_xprt_local_port(xprt))
1062 continue; 1071 continue;
1063 found = xprt; 1072 found = xprt;
1064 svc_xprt_get(xprt); 1073 svc_xprt_get(xprt);
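Per the kernel-doc added above, both @af and @port act as wild-cards when zero, and a successful lookup returns with a reference held. A hypothetical use (the serv pointer is the caller's; the class name must match a registered transport class):

    struct svc_xprt *xprt;

    /* first "tcp"-class transport in serv, any family, any port */
    xprt = svc_find_xprt(serv, "tcp", AF_UNSPEC, 0);
    if (xprt != NULL) {
            /* ... inspect the transport ... */
            svc_xprt_put(xprt);     /* drop the reference the lookup took */
    }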
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5763e6460fea..9d504234af4a 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1110,7 +1110,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1110 struct svc_sock *svsk; 1110 struct svc_sock *svsk;
1111 struct sock *inet; 1111 struct sock *inet;
1112 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1112 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
1113 int val;
1114 1113
1115 dprintk("svc: svc_setup_socket %p\n", sock); 1114 dprintk("svc: svc_setup_socket %p\n", sock);
1116 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { 1115 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1122,7 +1121,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1122 1121
1123 /* Register socket with portmapper */ 1122 /* Register socket with portmapper */
1124 if (*errp >= 0 && pmap_register) 1123 if (*errp >= 0 && pmap_register)
1125 *errp = svc_register(serv, inet->sk_protocol, 1124 *errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
1126 ntohs(inet_sk(inet)->sport)); 1125 ntohs(inet_sk(inet)->sport));
1127 1126
1128 if (*errp < 0) { 1127 if (*errp < 0) {
@@ -1143,18 +1142,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1143 else 1142 else
1144 svc_tcp_init(svsk, serv); 1143 svc_tcp_init(svsk, serv);
1145 1144
1146 /*
1147 * We start one listener per sv_serv. We want AF_INET
1148 * requests to be automatically shunted to our AF_INET6
1149 * listener using a mapped IPv4 address. Make sure
1150 * no-one starts an equivalent IPv4 listener, which
1151 * would steal our incoming connections.
1152 */
1153 val = 0;
1154 if (serv->sv_family == AF_INET6)
1155 kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
1156 (char *)&val, sizeof(val));
1157
1158 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1145 dprintk("svc: svc_setup_socket created %p (inet %p)\n",
1159 svsk, svsk->sk_sk); 1146 svsk, svsk->sk_sk);
1160 1147
@@ -1222,6 +1209,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1222 struct sockaddr_storage addr; 1209 struct sockaddr_storage addr;
1223 struct sockaddr *newsin = (struct sockaddr *)&addr; 1210 struct sockaddr *newsin = (struct sockaddr *)&addr;
1224 int newlen; 1211 int newlen;
1212 int family;
1213 int val;
1225 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 1214 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
1226 1215
1227 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1216 dprintk("svc: svc_create_socket(%s, %d, %s)\n",
@@ -1233,14 +1222,35 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
1233 "sockets supported\n"); 1222 "sockets supported\n");
1234 return ERR_PTR(-EINVAL); 1223 return ERR_PTR(-EINVAL);
1235 } 1224 }
1225
1236 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1226 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
1227 switch (sin->sa_family) {
1228 case AF_INET6:
1229 family = PF_INET6;
1230 break;
1231 case AF_INET:
1232 family = PF_INET;
1233 break;
1234 default:
1235 return ERR_PTR(-EINVAL);
1236 }
1237 1237
1238 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1238 error = sock_create_kern(family, type, protocol, &sock);
1239 if (error < 0) 1239 if (error < 0)
1240 return ERR_PTR(error); 1240 return ERR_PTR(error);
1241 1241
1242 svc_reclassify_socket(sock); 1242 svc_reclassify_socket(sock);
1243 1243
1244 /*
 1245 * If this is a PF_INET6 listener, we want to avoid
1246 * getting requests from IPv4 remotes. Those should
1247 * be shunted to a PF_INET listener via rpcbind.
1248 */
1249 val = 1;
1250 if (family == PF_INET6)
1251 kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
1252 (char *)&val, sizeof(val));
1253
1244 if (type == SOCK_STREAM) 1254 if (type == SOCK_STREAM)
1245 sock->sk->sk_reuse = 1; /* allow address reuse */ 1255 sock->sk->sk_reuse = 1; /* allow address reuse */
1246 error = kernel_bind(sock, sin, len); 1256 error = kernel_bind(sock, sin, len);
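The comment added above is the flip side of the deleted svc_setup_socket() logic: instead of one mapped-v4 listener, each PF_INET6 listener is now v6-only so that an independent PF_INET listener can own the same port. A minimal sketch of the pattern for a kernel TCP socket (the ex_* name is hypothetical; the setsockopt return value is ignored, as in the hunk):

    static struct socket *ex_make_v6only_listener(void)
    {
            struct socket *sock;
            int val = 1;
            int err;

            err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
            if (err < 0)
                    return ERR_PTR(err);

            /* refuse v4-mapped peers; a PF_INET listener can share the port */
            kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
                              (char *)&val, sizeof(val));
            return sock;
    }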
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 62098d101a1f..a0bfe53f1621 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -152,6 +152,37 @@ out:
152EXPORT_SYMBOL_GPL(xprt_unregister_transport); 152EXPORT_SYMBOL_GPL(xprt_unregister_transport);
153 153
154/** 154/**
155 * xprt_load_transport - load a transport implementation
156 * @transport_name: transport to load
157 *
158 * Returns:
159 * 0: transport successfully loaded
160 * -ENOENT: transport module not available
161 */
162int xprt_load_transport(const char *transport_name)
163{
164 struct xprt_class *t;
165 char module_name[sizeof t->name + 5];
166 int result;
167
168 result = 0;
169 spin_lock(&xprt_list_lock);
170 list_for_each_entry(t, &xprt_list, list) {
171 if (strcmp(t->name, transport_name) == 0) {
172 spin_unlock(&xprt_list_lock);
173 goto out;
174 }
175 }
176 spin_unlock(&xprt_list_lock);
177 strcpy(module_name, "xprt");
178 strncat(module_name, transport_name, sizeof t->name);
179 result = request_module(module_name);
180out:
181 return result;
182}
183EXPORT_SYMBOL_GPL(xprt_load_transport);
184
185/**
155 * xprt_reserve_xprt - serialize write access to transports 186 * xprt_reserve_xprt - serialize write access to transports
156 * @task: task that is requesting access to the transport 187 * @task: task that is requesting access to the transport
157 * 188 *
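xprt_load_transport() above prefixes the class name with "xprt", so asking for a class that is not yet registered turns into a request_module() for the matching module. A hypothetical caller resolving the RDMA transport before creating an rpc_xprt:

    /* "rdma" not registered yet -> request_module("xprtrdma") */
    if (xprt_load_transport("rdma") != 0)
            return -ENOENT;         /* transport module not available */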
@@ -580,7 +611,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
580 dprintk("RPC: disconnected transport %p\n", xprt); 611 dprintk("RPC: disconnected transport %p\n", xprt);
581 spin_lock_bh(&xprt->transport_lock); 612 spin_lock_bh(&xprt->transport_lock);
582 xprt_clear_connected(xprt); 613 xprt_clear_connected(xprt);
583 xprt_wake_pending_tasks(xprt, -ENOTCONN); 614 xprt_wake_pending_tasks(xprt, -EAGAIN);
584 spin_unlock_bh(&xprt->transport_lock); 615 spin_unlock_bh(&xprt->transport_lock);
585} 616}
586EXPORT_SYMBOL_GPL(xprt_disconnect_done); 617EXPORT_SYMBOL_GPL(xprt_disconnect_done);
@@ -598,7 +629,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
598 /* Try to schedule an autoclose RPC call */ 629 /* Try to schedule an autoclose RPC call */
599 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 630 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
600 queue_work(rpciod_workqueue, &xprt->task_cleanup); 631 queue_work(rpciod_workqueue, &xprt->task_cleanup);
601 xprt_wake_pending_tasks(xprt, -ENOTCONN); 632 xprt_wake_pending_tasks(xprt, -EAGAIN);
602 spin_unlock_bh(&xprt->transport_lock); 633 spin_unlock_bh(&xprt->transport_lock);
603} 634}
604 635
@@ -625,7 +656,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
625 /* Try to schedule an autoclose RPC call */ 656 /* Try to schedule an autoclose RPC call */
626 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 657 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
627 queue_work(rpciod_workqueue, &xprt->task_cleanup); 658 queue_work(rpciod_workqueue, &xprt->task_cleanup);
628 xprt_wake_pending_tasks(xprt, -ENOTCONN); 659 xprt_wake_pending_tasks(xprt, -EAGAIN);
629out: 660out:
630 spin_unlock_bh(&xprt->transport_lock); 661 spin_unlock_bh(&xprt->transport_lock);
631} 662}
@@ -695,9 +726,8 @@ static void xprt_connect_status(struct rpc_task *task)
695 } 726 }
696 727
697 switch (task->tk_status) { 728 switch (task->tk_status) {
698 case -ENOTCONN: 729 case -EAGAIN:
699 dprintk("RPC: %5u xprt_connect_status: connection broken\n", 730 dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
700 task->tk_pid);
701 break; 731 break;
702 case -ETIMEDOUT: 732 case -ETIMEDOUT:
703 dprintk("RPC: %5u xprt_connect_status: connect attempt timed " 733 dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
@@ -818,15 +848,8 @@ int xprt_prepare_transmit(struct rpc_task *task)
818 err = req->rq_received; 848 err = req->rq_received;
819 goto out_unlock; 849 goto out_unlock;
820 } 850 }
821 if (!xprt->ops->reserve_xprt(task)) { 851 if (!xprt->ops->reserve_xprt(task))
822 err = -EAGAIN; 852 err = -EAGAIN;
823 goto out_unlock;
824 }
825
826 if (!xprt_connected(xprt)) {
827 err = -ENOTCONN;
828 goto out_unlock;
829 }
830out_unlock: 853out_unlock:
831 spin_unlock_bh(&xprt->transport_lock); 854 spin_unlock_bh(&xprt->transport_lock);
832 return err; 855 return err;
@@ -870,32 +893,26 @@ void xprt_transmit(struct rpc_task *task)
870 req->rq_connect_cookie = xprt->connect_cookie; 893 req->rq_connect_cookie = xprt->connect_cookie;
871 req->rq_xtime = jiffies; 894 req->rq_xtime = jiffies;
872 status = xprt->ops->send_request(task); 895 status = xprt->ops->send_request(task);
873 if (status == 0) { 896 if (status != 0) {
874 dprintk("RPC: %5u xmit complete\n", task->tk_pid); 897 task->tk_status = status;
875 spin_lock_bh(&xprt->transport_lock); 898 return;
899 }
876 900
877 xprt->ops->set_retrans_timeout(task); 901 dprintk("RPC: %5u xmit complete\n", task->tk_pid);
902 spin_lock_bh(&xprt->transport_lock);
878 903
879 xprt->stat.sends++; 904 xprt->ops->set_retrans_timeout(task);
880 xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
881 xprt->stat.bklog_u += xprt->backlog.qlen;
882 905
883 /* Don't race with disconnect */ 906 xprt->stat.sends++;
884 if (!xprt_connected(xprt)) 907 xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
885 task->tk_status = -ENOTCONN; 908 xprt->stat.bklog_u += xprt->backlog.qlen;
886 else if (!req->rq_received)
887 rpc_sleep_on(&xprt->pending, task, xprt_timer);
888 spin_unlock_bh(&xprt->transport_lock);
889 return;
890 }
891 909
892 /* Note: at this point, task->tk_sleeping has not yet been set, 910 /* Don't race with disconnect */
893 * hence there is no danger of the waking up task being put on 911 if (!xprt_connected(xprt))
894 * schedq, and being picked up by a parallel run of rpciod(). 912 task->tk_status = -ENOTCONN;
895 */ 913 else if (!req->rq_received)
896 task->tk_status = status; 914 rpc_sleep_on(&xprt->pending, task, xprt_timer);
897 if (status == -ECONNREFUSED) 915 spin_unlock_bh(&xprt->transport_lock);
898 rpc_sleep_on(&xprt->sending, task, NULL);
899} 916}
900 917
901static inline void do_xprt_reserve(struct rpc_task *task) 918static inline void do_xprt_reserve(struct rpc_task *task)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 14106d26bb95..e5e28d1946a4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -310,6 +310,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
310 __func__, pad, destp, rqst->rq_slen, curlen); 310 __func__, pad, destp, rqst->rq_slen, curlen);
311 311
312 copy_len = rqst->rq_snd_buf.page_len; 312 copy_len = rqst->rq_snd_buf.page_len;
313
314 if (rqst->rq_snd_buf.tail[0].iov_len) {
315 curlen = rqst->rq_snd_buf.tail[0].iov_len;
316 if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
317 memmove(destp + copy_len,
318 rqst->rq_snd_buf.tail[0].iov_base, curlen);
319 r_xprt->rx_stats.pullup_copy_count += curlen;
320 }
321 dprintk("RPC: %s: tail destp 0x%p len %d\n",
322 __func__, destp + copy_len, curlen);
323 rqst->rq_svec[0].iov_len += curlen;
324 }
325
313 r_xprt->rx_stats.pullup_copy_count += copy_len; 326 r_xprt->rx_stats.pullup_copy_count += copy_len;
314 npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; 327 npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
315 for (i = 0; copy_len && i < npages; i++) { 328 for (i = 0; copy_len && i < npages; i++) {
@@ -332,17 +345,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
332 destp += curlen; 345 destp += curlen;
333 copy_len -= curlen; 346 copy_len -= curlen;
334 } 347 }
335 if (rqst->rq_snd_buf.tail[0].iov_len) {
336 curlen = rqst->rq_snd_buf.tail[0].iov_len;
337 if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
338 memcpy(destp,
339 rqst->rq_snd_buf.tail[0].iov_base, curlen);
340 r_xprt->rx_stats.pullup_copy_count += curlen;
341 }
342 dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n",
343 __func__, destp, copy_len, curlen);
344 rqst->rq_svec[0].iov_len += curlen;
345 }
346 /* header now contains entire send message */ 348 /* header now contains entire send message */
347 return pad; 349 return pad;
348} 350}
@@ -656,7 +658,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
656 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 658 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
657 curlen = rqst->rq_rcv_buf.tail[0].iov_len; 659 curlen = rqst->rq_rcv_buf.tail[0].iov_len;
658 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 660 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
659 memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 661 memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
660 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 662 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
661 __func__, srcp, copy_len, curlen); 663 __func__, srcp, copy_len, curlen);
662 rqst->rq_rcv_buf.tail[0].iov_len = curlen; 664 rqst->rq_rcv_buf.tail[0].iov_len = curlen;
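Two hunks above swap memcpy() for memmove(): once the tail iovec is relocated within the same marshalling buffer, source and destination ranges can overlap, which memcpy() does not permit but memmove() is specified to handle. A standalone illustration in plain C (not kernel code):

    #include <string.h>

    static void ex_overlapping_shift(void)
    {
            char buf[16] = "abcdefgh";

            /* overlapping shift: well-defined with memmove() */
            memmove(buf + 2, buf, 8);

            /* memcpy(buf + 2, buf, 8) would be undefined behaviour here */
    }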
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a3334e3b73cc..6c26a675435a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -191,7 +191,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
191 struct xdr_buf *xdr, 191 struct xdr_buf *xdr,
192 struct svc_rdma_req_map *vec) 192 struct svc_rdma_req_map *vec)
193{ 193{
194 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
195 int sge_no; 194 int sge_no;
196 u32 sge_bytes; 195 u32 sge_bytes;
197 u32 page_bytes; 196 u32 page_bytes;
@@ -235,7 +234,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
235 sge_no++; 234 sge_no++;
236 } 235 }
237 236
238 BUG_ON(sge_no > sge_max); 237 dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
238 "page_base %u page_len %u head_len %zu tail_len %zu\n",
239 sge_no, page_no, xdr->page_base, xdr->page_len,
240 xdr->head[0].iov_len, xdr->tail[0].iov_len);
241
239 vec->count = sge_no; 242 vec->count = sge_no;
240 return 0; 243 return 0;
241} 244}
@@ -579,7 +582,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
579 ctxt->sge[page_no+1].length = 0; 582 ctxt->sge[page_no+1].length = 0;
580 } 583 }
581 BUG_ON(sge_no > rdma->sc_max_sge); 584 BUG_ON(sge_no > rdma->sc_max_sge);
582 BUG_ON(sge_no > ctxt->count);
583 memset(&send_wr, 0, sizeof send_wr); 585 memset(&send_wr, 0, sizeof send_wr);
584 ctxt->wr_op = IB_WR_SEND; 586 ctxt->wr_op = IB_WR_SEND;
585 send_wr.wr_id = (unsigned long)ctxt; 587 send_wr.wr_id = (unsigned long)ctxt;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 568330eebbfe..d40ff50887aa 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -49,6 +49,9 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
49unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; 49unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
50unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; 50unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
51 51
52#define XS_TCP_LINGER_TO (15U * HZ)
53static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
54
52/* 55/*
53 * We can register our own files under /proc/sys/sunrpc by 56 * We can register our own files under /proc/sys/sunrpc by
54 * calling register_sysctl_table() again. The files in that 57 * calling register_sysctl_table() again. The files in that
@@ -117,6 +120,14 @@ static ctl_table xs_tunables_table[] = {
117 .extra2 = &xprt_max_resvport_limit 120 .extra2 = &xprt_max_resvport_limit
118 }, 121 },
119 { 122 {
123 .procname = "tcp_fin_timeout",
124 .data = &xs_tcp_fin_timeout,
125 .maxlen = sizeof(xs_tcp_fin_timeout),
126 .mode = 0644,
127 .proc_handler = &proc_dointvec_jiffies,
128 .strategy = sysctl_jiffies
129 },
130 {
120 .ctl_name = 0, 131 .ctl_name = 0,
121 }, 132 },
122}; 133};
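One detail of the new tunable: proc_dointvec_jiffies converts at the /proc boundary, so user space reads and writes whole seconds while xs_tcp_fin_timeout itself holds jiffies. The 15U*HZ default therefore appears as 15 in /proc/sys/sunrpc/tcp_fin_timeout; in kernel code the conversion is just a division (a sketch):

    /* seconds as user space sees them; the variable itself is jiffies */
    unsigned int shown = xs_tcp_fin_timeout / HZ;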
@@ -521,11 +532,12 @@ static void xs_nospace_callback(struct rpc_task *task)
521 * @task: task to put to sleep 532 * @task: task to put to sleep
522 * 533 *
523 */ 534 */
524static void xs_nospace(struct rpc_task *task) 535static int xs_nospace(struct rpc_task *task)
525{ 536{
526 struct rpc_rqst *req = task->tk_rqstp; 537 struct rpc_rqst *req = task->tk_rqstp;
527 struct rpc_xprt *xprt = req->rq_xprt; 538 struct rpc_xprt *xprt = req->rq_xprt;
528 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 539 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
540 int ret = 0;
529 541
530 dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", 542 dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
531 task->tk_pid, req->rq_slen - req->rq_bytes_sent, 543 task->tk_pid, req->rq_slen - req->rq_bytes_sent,
@@ -537,6 +549,7 @@ static void xs_nospace(struct rpc_task *task)
537 /* Don't race with disconnect */ 549 /* Don't race with disconnect */
538 if (xprt_connected(xprt)) { 550 if (xprt_connected(xprt)) {
539 if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { 551 if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
552 ret = -EAGAIN;
540 /* 553 /*
541 * Notify TCP that we're limited by the application 554 * Notify TCP that we're limited by the application
542 * window size 555 * window size
@@ -548,10 +561,11 @@ static void xs_nospace(struct rpc_task *task)
548 } 561 }
549 } else { 562 } else {
550 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); 563 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
551 task->tk_status = -ENOTCONN; 564 ret = -ENOTCONN;
552 } 565 }
553 566
554 spin_unlock_bh(&xprt->transport_lock); 567 spin_unlock_bh(&xprt->transport_lock);
568 return ret;
555} 569}
556 570
557/** 571/**
@@ -594,6 +608,8 @@ static int xs_udp_send_request(struct rpc_task *task)
594 /* Still some bytes left; set up for a retry later. */ 608 /* Still some bytes left; set up for a retry later. */
595 status = -EAGAIN; 609 status = -EAGAIN;
596 } 610 }
611 if (!transport->sock)
612 goto out;
597 613
598 switch (status) { 614 switch (status) {
599 case -ENOTSOCK: 615 case -ENOTSOCK:
@@ -601,21 +617,19 @@ static int xs_udp_send_request(struct rpc_task *task)
601 /* Should we call xs_close() here? */ 617 /* Should we call xs_close() here? */
602 break; 618 break;
603 case -EAGAIN: 619 case -EAGAIN:
604 xs_nospace(task); 620 status = xs_nospace(task);
605 break; 621 break;
622 default:
623 dprintk("RPC: sendmsg returned unrecognized error %d\n",
624 -status);
606 case -ENETUNREACH: 625 case -ENETUNREACH:
607 case -EPIPE: 626 case -EPIPE:
608 case -ECONNREFUSED: 627 case -ECONNREFUSED:
609 /* When the server has died, an ICMP port unreachable message 628 /* When the server has died, an ICMP port unreachable message
610 * prompts ECONNREFUSED. */ 629 * prompts ECONNREFUSED. */
611 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); 630 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
612 break;
613 default:
614 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
615 dprintk("RPC: sendmsg returned unrecognized error %d\n",
616 -status);
617 } 631 }
618 632out:
619 return status; 633 return status;
620} 634}
621 635
@@ -697,6 +711,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
697 status = -EAGAIN; 711 status = -EAGAIN;
698 break; 712 break;
699 } 713 }
714 if (!transport->sock)
715 goto out;
700 716
701 switch (status) { 717 switch (status) {
702 case -ENOTSOCK: 718 case -ENOTSOCK:
@@ -704,23 +720,19 @@ static int xs_tcp_send_request(struct rpc_task *task)
704 /* Should we call xs_close() here? */ 720 /* Should we call xs_close() here? */
705 break; 721 break;
706 case -EAGAIN: 722 case -EAGAIN:
707 xs_nospace(task); 723 status = xs_nospace(task);
708 break; 724 break;
725 default:
726 dprintk("RPC: sendmsg returned unrecognized error %d\n",
727 -status);
709 case -ECONNRESET: 728 case -ECONNRESET:
729 case -EPIPE:
710 xs_tcp_shutdown(xprt); 730 xs_tcp_shutdown(xprt);
711 case -ECONNREFUSED: 731 case -ECONNREFUSED:
712 case -ENOTCONN: 732 case -ENOTCONN:
713 case -EPIPE:
714 status = -ENOTCONN;
715 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
716 break;
717 default:
718 dprintk("RPC: sendmsg returned unrecognized error %d\n",
719 -status);
720 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); 733 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
721 xs_tcp_shutdown(xprt);
722 } 734 }
723 735out:
724 return status; 736 return status;
725} 737}
726 738
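The reordered switches in xs_udp_send_request() and xs_tcp_send_request() use an unusual shape: default: sits above the named cases, so an unrecognized sendmsg() error is logged and then falls through into the normal connection-error handling instead of getting a private copy of it. Distilled from the TCP hunk above (not the complete function):

    switch (status) {
    case -EAGAIN:
            status = xs_nospace(task);
            break;
    default:
            dprintk("RPC: sendmsg returned unrecognized error %d\n",
                    -status);
            /* fall through: treat it like a connection error */
    case -ECONNRESET:
    case -EPIPE:
            xs_tcp_shutdown(xprt);
            /* fall through */
    case -ECONNREFUSED:
    case -ENOTCONN:
            clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
    }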
@@ -767,23 +779,13 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
767 sk->sk_error_report = transport->old_error_report; 779 sk->sk_error_report = transport->old_error_report;
768} 780}
769 781
770/** 782static void xs_reset_transport(struct sock_xprt *transport)
771 * xs_close - close a socket
772 * @xprt: transport
773 *
 774 * This is used when all requests are complete; i.e., no DRC state remains
775 * on the server we want to save.
776 */
777static void xs_close(struct rpc_xprt *xprt)
778{ 783{
779 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
780 struct socket *sock = transport->sock; 784 struct socket *sock = transport->sock;
781 struct sock *sk = transport->inet; 785 struct sock *sk = transport->inet;
782 786
783 if (!sk) 787 if (sk == NULL)
784 goto clear_close_wait; 788 return;
785
786 dprintk("RPC: xs_close xprt %p\n", xprt);
787 789
788 write_lock_bh(&sk->sk_callback_lock); 790 write_lock_bh(&sk->sk_callback_lock);
789 transport->inet = NULL; 791 transport->inet = NULL;
@@ -797,8 +799,25 @@ static void xs_close(struct rpc_xprt *xprt)
797 sk->sk_no_check = 0; 799 sk->sk_no_check = 0;
798 800
799 sock_release(sock); 801 sock_release(sock);
800clear_close_wait: 802}
803
804/**
805 * xs_close - close a socket
806 * @xprt: transport
807 *
 808 * This is used when all requests are complete; i.e., no DRC state remains
809 * on the server we want to save.
810 */
811static void xs_close(struct rpc_xprt *xprt)
812{
813 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
814
815 dprintk("RPC: xs_close xprt %p\n", xprt);
816
817 xs_reset_transport(transport);
818
801 smp_mb__before_clear_bit(); 819 smp_mb__before_clear_bit();
820 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
802 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 821 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
803 clear_bit(XPRT_CLOSING, &xprt->state); 822 clear_bit(XPRT_CLOSING, &xprt->state);
804 smp_mb__after_clear_bit(); 823 smp_mb__after_clear_bit();
@@ -1126,6 +1145,47 @@ out:
1126 read_unlock(&sk->sk_callback_lock); 1145 read_unlock(&sk->sk_callback_lock);
1127} 1146}
1128 1147
1148/*
1149 * Do the equivalent of linger/linger2 handling for dealing with
1150 * broken servers that don't close the socket in a timely
1151 * fashion
1152 */
1153static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt,
1154 unsigned long timeout)
1155{
1156 struct sock_xprt *transport;
1157
1158 if (xprt_test_and_set_connecting(xprt))
1159 return;
1160 set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1161 transport = container_of(xprt, struct sock_xprt, xprt);
1162 queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
1163 timeout);
1164}
1165
1166static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
1167{
1168 struct sock_xprt *transport;
1169
1170 transport = container_of(xprt, struct sock_xprt, xprt);
1171
1172 if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) ||
1173 !cancel_delayed_work(&transport->connect_worker))
1174 return;
1175 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1176 xprt_clear_connecting(xprt);
1177}
1178
1179static void xs_sock_mark_closed(struct rpc_xprt *xprt)
1180{
1181 smp_mb__before_clear_bit();
1182 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
1183 clear_bit(XPRT_CLOSING, &xprt->state);
1184 smp_mb__after_clear_bit();
1185 /* Mark transport as closed and wake up all pending tasks */
1186 xprt_disconnect_done(xprt);
1187}
1188
1129/** 1189/**
1130 * xs_tcp_state_change - callback to handle TCP socket state changes 1190 * xs_tcp_state_change - callback to handle TCP socket state changes
1131 * @sk: socket whose state has changed 1191 * @sk: socket whose state has changed
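The linger helpers above piggyback on the connect worker rather than adding a new timer: the connecting bit is claimed atomically so at most one linger timeout is queued, XPRT_CONNECTION_ABORT records why it was queued, and the cancel path only unwinds state if it actually removed the pending work. The schedule/cancel pairing, distilled from the two functions:

    /* schedule: claim the connecting state, then arm the delayed work */
    if (xprt_test_and_set_connecting(xprt))
            return;                 /* a connect is already in flight */
    set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
    queue_delayed_work(rpciod_workqueue, &transport->connect_worker, timeout);

    /* cancel: only undo state if the pending work was really removed */
    if (test_bit(XPRT_CONNECTION_ABORT, &xprt->state) &&
        cancel_delayed_work(&transport->connect_worker)) {
            clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
            xprt_clear_connecting(xprt);
    }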
@@ -1158,7 +1218,7 @@ static void xs_tcp_state_change(struct sock *sk)
1158 transport->tcp_flags = 1218 transport->tcp_flags =
1159 TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; 1219 TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
1160 1220
1161 xprt_wake_pending_tasks(xprt, 0); 1221 xprt_wake_pending_tasks(xprt, -EAGAIN);
1162 } 1222 }
1163 spin_unlock_bh(&xprt->transport_lock); 1223 spin_unlock_bh(&xprt->transport_lock);
1164 break; 1224 break;
@@ -1171,10 +1231,10 @@ static void xs_tcp_state_change(struct sock *sk)
1171 clear_bit(XPRT_CONNECTED, &xprt->state); 1231 clear_bit(XPRT_CONNECTED, &xprt->state);
1172 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 1232 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
1173 smp_mb__after_clear_bit(); 1233 smp_mb__after_clear_bit();
1234 xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
1174 break; 1235 break;
1175 case TCP_CLOSE_WAIT: 1236 case TCP_CLOSE_WAIT:
1176 /* The server initiated a shutdown of the socket */ 1237 /* The server initiated a shutdown of the socket */
1177 set_bit(XPRT_CLOSING, &xprt->state);
1178 xprt_force_disconnect(xprt); 1238 xprt_force_disconnect(xprt);
1179 case TCP_SYN_SENT: 1239 case TCP_SYN_SENT:
1180 xprt->connect_cookie++; 1240 xprt->connect_cookie++;
@@ -1187,40 +1247,35 @@ static void xs_tcp_state_change(struct sock *sk)
1187 xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; 1247 xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
1188 break; 1248 break;
1189 case TCP_LAST_ACK: 1249 case TCP_LAST_ACK:
1250 set_bit(XPRT_CLOSING, &xprt->state);
1251 xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
1190 smp_mb__before_clear_bit(); 1252 smp_mb__before_clear_bit();
1191 clear_bit(XPRT_CONNECTED, &xprt->state); 1253 clear_bit(XPRT_CONNECTED, &xprt->state);
1192 smp_mb__after_clear_bit(); 1254 smp_mb__after_clear_bit();
1193 break; 1255 break;
1194 case TCP_CLOSE: 1256 case TCP_CLOSE:
1195 smp_mb__before_clear_bit(); 1257 xs_tcp_cancel_linger_timeout(xprt);
1196 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 1258 xs_sock_mark_closed(xprt);
1197 clear_bit(XPRT_CLOSING, &xprt->state);
1198 smp_mb__after_clear_bit();
1199 /* Mark transport as closed and wake up all pending tasks */
1200 xprt_disconnect_done(xprt);
1201 } 1259 }
1202 out: 1260 out:
1203 read_unlock(&sk->sk_callback_lock); 1261 read_unlock(&sk->sk_callback_lock);
1204} 1262}
1205 1263
1206/** 1264/**
1207 * xs_tcp_error_report - callback mainly for catching RST events 1265 * xs_error_report - callback mainly for catching socket errors
1208 * @sk: socket 1266 * @sk: socket
1209 */ 1267 */
1210static void xs_tcp_error_report(struct sock *sk) 1268static void xs_error_report(struct sock *sk)
1211{ 1269{
1212 struct rpc_xprt *xprt; 1270 struct rpc_xprt *xprt;
1213 1271
1214 read_lock(&sk->sk_callback_lock); 1272 read_lock(&sk->sk_callback_lock);
1215 if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED)
1216 goto out;
1217 if (!(xprt = xprt_from_sock(sk))) 1273 if (!(xprt = xprt_from_sock(sk)))
1218 goto out; 1274 goto out;
1219 dprintk("RPC: %s client %p...\n" 1275 dprintk("RPC: %s client %p...\n"
1220 "RPC: error %d\n", 1276 "RPC: error %d\n",
1221 __func__, xprt, sk->sk_err); 1277 __func__, xprt, sk->sk_err);
1222 1278 xprt_wake_pending_tasks(xprt, -EAGAIN);
1223 xprt_force_disconnect(xprt);
1224out: 1279out:
1225 read_unlock(&sk->sk_callback_lock); 1280 read_unlock(&sk->sk_callback_lock);
1226} 1281}
@@ -1494,6 +1549,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1494 sk->sk_user_data = xprt; 1549 sk->sk_user_data = xprt;
1495 sk->sk_data_ready = xs_udp_data_ready; 1550 sk->sk_data_ready = xs_udp_data_ready;
1496 sk->sk_write_space = xs_udp_write_space; 1551 sk->sk_write_space = xs_udp_write_space;
1552 sk->sk_error_report = xs_error_report;
1497 sk->sk_no_check = UDP_CSUM_NORCV; 1553 sk->sk_no_check = UDP_CSUM_NORCV;
1498 sk->sk_allocation = GFP_ATOMIC; 1554 sk->sk_allocation = GFP_ATOMIC;
1499 1555
@@ -1526,9 +1582,10 @@ static void xs_udp_connect_worker4(struct work_struct *work)
1526 goto out; 1582 goto out;
1527 1583
1528 /* Start by resetting any existing state */ 1584 /* Start by resetting any existing state */
1529 xs_close(xprt); 1585 xs_reset_transport(transport);
1530 1586
1531 if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { 1587 err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
1588 if (err < 0) {
1532 dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1589 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1533 goto out; 1590 goto out;
1534 } 1591 }
@@ -1545,8 +1602,8 @@ static void xs_udp_connect_worker4(struct work_struct *work)
1545 xs_udp_finish_connecting(xprt, sock); 1602 xs_udp_finish_connecting(xprt, sock);
1546 status = 0; 1603 status = 0;
1547out: 1604out:
1548 xprt_wake_pending_tasks(xprt, status);
1549 xprt_clear_connecting(xprt); 1605 xprt_clear_connecting(xprt);
1606 xprt_wake_pending_tasks(xprt, status);
1550} 1607}
1551 1608
1552/** 1609/**
@@ -1567,9 +1624,10 @@ static void xs_udp_connect_worker6(struct work_struct *work)
1567 goto out; 1624 goto out;
1568 1625
1569 /* Start by resetting any existing state */ 1626 /* Start by resetting any existing state */
1570 xs_close(xprt); 1627 xs_reset_transport(transport);
1571 1628
1572 if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { 1629 err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
1630 if (err < 0) {
1573 dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1631 dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
1574 goto out; 1632 goto out;
1575 } 1633 }
@@ -1586,18 +1644,17 @@ static void xs_udp_connect_worker6(struct work_struct *work)
1586 xs_udp_finish_connecting(xprt, sock); 1644 xs_udp_finish_connecting(xprt, sock);
1587 status = 0; 1645 status = 0;
1588out: 1646out:
1589 xprt_wake_pending_tasks(xprt, status);
1590 xprt_clear_connecting(xprt); 1647 xprt_clear_connecting(xprt);
1648 xprt_wake_pending_tasks(xprt, status);
1591} 1649}
1592 1650
1593/* 1651/*
1594 * We need to preserve the port number so the reply cache on the server can 1652 * We need to preserve the port number so the reply cache on the server can
1595 * find our cached RPC replies when we get around to reconnecting. 1653 * find our cached RPC replies when we get around to reconnecting.
1596 */ 1654 */
1597static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) 1655static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
1598{ 1656{
1599 int result; 1657 int result;
1600 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
1601 struct sockaddr any; 1658 struct sockaddr any;
1602 1659
1603 dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); 1660 dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt);
@@ -1609,11 +1666,24 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
1609 memset(&any, 0, sizeof(any)); 1666 memset(&any, 0, sizeof(any));
1610 any.sa_family = AF_UNSPEC; 1667 any.sa_family = AF_UNSPEC;
1611 result = kernel_connect(transport->sock, &any, sizeof(any), 0); 1668 result = kernel_connect(transport->sock, &any, sizeof(any), 0);
1612 if (result) 1669 if (!result)
1670 xs_sock_mark_closed(xprt);
1671 else
1613 dprintk("RPC: AF_UNSPEC connect return code %d\n", 1672 dprintk("RPC: AF_UNSPEC connect return code %d\n",
1614 result); 1673 result);
1615} 1674}
1616 1675
1676static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
1677{
1678 unsigned int state = transport->inet->sk_state;
1679
1680 if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED)
1681 return;
1682 if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT))
1683 return;
1684 xs_abort_connection(xprt, transport);
1685}
1686
1617static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1687static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
1618{ 1688{
1619 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1689 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1629,7 +1699,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_data_ready = xs_tcp_data_ready;
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_write_space = xs_tcp_write_space;
-		sk->sk_error_report = xs_tcp_error_report;
+		sk->sk_error_report = xs_error_report;
 		sk->sk_allocation = GFP_ATOMIC;
 
 		/* socket options */
@@ -1657,37 +1727,42 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 }
 
 /**
- * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
+ * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker4(struct work_struct *work)
+static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
+		struct sock_xprt *transport,
+		struct socket *(*create_sock)(struct rpc_xprt *,
+			struct sock_xprt *))
 {
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock = transport->sock;
-	int err, status = -EIO;
+	int status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
 	if (!sock) {
-		/* start from scratch */
-		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
+		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+		sock = create_sock(xprt, transport);
+		if (IS_ERR(sock)) {
+			status = PTR_ERR(sock);
 			goto out;
 		}
-		xs_reclassify_socket4(sock);
+	} else {
+		int abort_and_exit;
 
-		if (xs_bind4(transport, sock) < 0) {
-			sock_release(sock);
-			goto out;
-		}
-	} else
+		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
+				&xprt->state);
 		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt);
+		xs_tcp_reuse_connection(xprt, transport);
+
+		if (abort_and_exit)
+			goto out_eagain;
+	}
 
 	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
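The hunk above is the heart of the refactor: the IPv4-specific worker body becomes xs_tcp_setup_socket(), which is address-family agnostic and takes a create_sock callback for the one step that differs; socket-creation failures come back through the kernel's ERR_PTR/IS_ERR convention. A compilable miniature of that shape (the ERR_PTR helpers below are simplified re-implementations for illustration, not the kernel's):

#include <stdio.h>
#include <stdint.h>

struct sock { int family; };

static inline void *ERR_PTR(intptr_t err) { return (void *)err; }
static inline intptr_t PTR_ERR(const void *p) { return (intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;
}

static struct sock *create_sock4(void)
{
	static struct sock s = { .family = 4 };
	return &s;
}

static struct sock *create_sock6(void)
{
	return ERR_PTR(-97);	/* e.g. -EAFNOSUPPORT */
}

/* one driver, parameterized by the per-family creator */
static void setup_socket(struct sock *(*create_sock)(void))
{
	struct sock *sk = create_sock();

	if (IS_ERR(sk)) {
		printf("create failed: %ld\n", (long)PTR_ERR(sk));
		return;
	}
	printf("connected, family %d\n", sk->family);
}

int main(void)
{
	setup_socket(create_sock4);	/* IPv4 worker */
	setup_socket(create_sock6);	/* IPv6 worker */
	return 0;
}

Parameterizing only the creation step keeps the reuse/abort handling and the connect-status classification in a single copy instead of two slowly diverging ones.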
@@ -1696,83 +1771,104 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
 	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt),
 			sock->sk->sk_state);
-	if (status < 0) {
-		switch (status) {
-			case -EINPROGRESS:
-			case -EALREADY:
-				goto out_clear;
-			case -ECONNREFUSED:
-			case -ECONNRESET:
-				/* retry with existing socket, after a delay */
-				break;
-			default:
-				/* get rid of existing socket, and retry */
-				xs_tcp_shutdown(xprt);
-		}
+	switch (status) {
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+	case -ENETUNREACH:
+		/* retry with existing socket, after a delay */
+	case 0:
+	case -EINPROGRESS:
+	case -EALREADY:
+		xprt_clear_connecting(xprt);
+		return;
 	}
+	/* get rid of existing socket, and retry */
+	xs_tcp_shutdown(xprt);
+	printk("%s: connect returned unhandled error %d\n",
+			__func__, status);
+out_eagain:
+	status = -EAGAIN;
 out:
-	xprt_wake_pending_tasks(xprt, status);
-out_clear:
 	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
+}
+
+static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
+		struct sock_xprt *transport)
+{
+	struct socket *sock;
+	int err;
+
+	/* start from scratch */
+	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		dprintk("RPC:       can't create TCP transport socket (%d).\n",
+				-err);
+		goto out_err;
+	}
+	xs_reclassify_socket4(sock);
+
+	if (xs_bind4(transport, sock) < 0) {
+		sock_release(sock);
+		goto out_err;
+	}
+	return sock;
+out_err:
+	return ERR_PTR(-EIO);
 }
 
 /**
- * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
  * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker6(struct work_struct *work)
+static void xs_tcp_connect_worker4(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
 	struct rpc_xprt *xprt = &transport->xprt;
-	struct socket *sock = transport->sock;
-	int err, status = -EIO;
 
-	if (xprt->shutdown)
-		goto out;
+	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4);
+}
 
-	if (!sock) {
-		/* start from scratch */
-		if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
-			dprintk("RPC:       can't create TCP transport socket (%d).\n", -err);
-			goto out;
-		}
-		xs_reclassify_socket6(sock);
+static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
+		struct sock_xprt *transport)
+{
+	struct socket *sock;
+	int err;
+
+	/* start from scratch */
+	err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		dprintk("RPC:       can't create TCP transport socket (%d).\n",
+				-err);
+		goto out_err;
+	}
+	xs_reclassify_socket6(sock);
 
 	if (xs_bind6(transport, sock) < 0) {
 		sock_release(sock);
-		goto out;
+		goto out_err;
 	}
-	} else
-		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt);
+	return sock;
+out_err:
+	return ERR_PTR(-EIO);
+}
 
-	dprintk("RPC:       worker connecting xprt %p to address: %s\n",
-			xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+/**
+ * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker6(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
 
-	status = xs_tcp_finish_connecting(xprt, sock);
-	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
-			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
-	if (status < 0) {
-		switch (status) {
-			case -EINPROGRESS:
-			case -EALREADY:
-				goto out_clear;
-			case -ECONNREFUSED:
-			case -ECONNRESET:
-				/* retry with existing socket, after a delay */
-				break;
-			default:
-				/* get rid of existing socket, and retry */
-				xs_tcp_shutdown(xprt);
-		}
-	}
-out:
-	xprt_wake_pending_tasks(xprt, status);
-out_clear:
-	xprt_clear_connecting(xprt);
+	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6);
 }
 
 /**
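The consolidated status handling above leans on C's case fallthrough: the soft network errors (-ECONNREFUSED, -ECONNRESET, -ENETUNREACH) fall into the same "done for now, keep the socket" group as 0, -EINPROGRESS and -EALREADY, while anything unlisted drops out of the switch, tears the socket down via xs_tcp_shutdown(), and is rewritten to -EAGAIN (through out_eagain) so the woken tasks simply retry. A small runnable rendering of that classification (printf standing in for the printk):

#include <errno.h>
#include <stdio.h>

static int classify(int status)
{
	switch (status) {
	case -ECONNREFUSED:
	case -ECONNRESET:
	case -ENETUNREACH:
		/* retry with existing socket, after a delay */
		/* fall through */
	case 0:
	case -EINPROGRESS:
	case -EALREADY:
		return status;	/* handled: leave the socket alone */
	}
	printf("unhandled connect error %d, tearing socket down\n", status);
	return -EAGAIN;		/* force the woken tasks to retry */
}

int main(void)
{
	printf("%d %d %d\n",
	       classify(0), classify(-ECONNRESET), classify(-EPERM));
	return 0;
}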
@@ -1817,9 +1913,6 @@ static void xs_tcp_connect(struct rpc_task *task)
 {
 	struct rpc_xprt *xprt = task->tk_xprt;
 
-	/* Initiate graceful shutdown of the socket if not already done */
-	if (test_bit(XPRT_CONNECTED, &xprt->state))
-		xs_tcp_shutdown(xprt);
 	/* Exit if we need to wait for socket shutdown to complete */
 	if (test_bit(XPRT_CLOSING, &xprt->state))
 		return;
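With the abort/reuse decision moved into xs_tcp_setup_socket(), xs_tcp_connect() above no longer initiates a shutdown itself when the transport is still connected; it only backs off while a shutdown is already in progress. A toy rendering of the remaining bit-test gate (userspace stand-ins, not the kernel's test_bit()):

#include <stdio.h>

enum { XPRT_CONNECTED, XPRT_CLOSING };

static int test_bit(int bit, const unsigned long *word)
{
	return (*word >> bit) & 1;
}

static void tcp_connect(unsigned long state)
{
	/* Exit if we need to wait for socket shutdown to complete */
	if (test_bit(XPRT_CLOSING, &state)) {
		printf("shutdown in progress, waiting\n");
		return;
	}
	printf("starting connect worker\n");
}

int main(void)
{
	tcp_connect(1UL << XPRT_CLOSING);
	tcp_connect(0);
	return 0;
}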