Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--  fs/xfs/linux-2.6/kmem.c          |  56
-rw-r--r--  fs/xfs/linux-2.6/kmem.h          |  21
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c       |  11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c      | 221
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c       | 320
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h       |  52
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c    |  20
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c      | 854
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c   |   2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c     |  21
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.h     |  12
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c   |   4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c      |  14
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h     |   1
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c       | 852
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h       |  32
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c  |  19
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c     | 175
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c      | 196
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h      |   1
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c     |  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h     | 103
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c     |  27
23 files changed, 1748 insertions(+), 1282 deletions(-)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..bc7405585def 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
21#include <linux/swap.h> 20#include <linux/swap.h>
22#include <linux/blkdev.h> 21#include <linux/blkdev.h>
@@ -24,8 +23,25 @@
24#include "time.h" 23#include "time.h"
25#include "kmem.h" 24#include "kmem.h"
26 25
27#define MAX_VMALLOCS 6 26/*
28#define MAX_SLAB_SIZE 0x20000 27 * Greedy allocation. May fail and may return vmalloced memory.
28 *
29 * Must be freed using kmem_free_large.
30 */
31void *
32kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
33{
34 void *ptr;
35 size_t kmsize = maxsize;
36
37 while (!(ptr = kmem_zalloc_large(kmsize))) {
38 if ((kmsize >>= 1) <= minsize)
39 kmsize = minsize;
40 }
41 if (ptr)
42 *size = kmsize;
43 return ptr;
44}
29 45
30void * 46void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 47kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 50 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 51 void *ptr;
36 52
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 53 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 54 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 55 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 56 return ptr;
52 if (!(++retries % 100)) 57 if (!(++retries % 100))
@@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 73 return ptr;
69} 74}
70 75
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 76void
93kmem_free(const void *ptr) 77kmem_free(const void *ptr)
94{ 78{
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..bf85bbe4a9ae 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type)
106 struct posix_acl *acl; 106 struct posix_acl *acl;
107 struct xfs_acl *xfs_acl; 107 struct xfs_acl *xfs_acl;
108 int len = sizeof(struct xfs_acl); 108 int len = sizeof(struct xfs_acl);
109 char *ea_name; 109 unsigned char *ea_name;
110 int error; 110 int error;
111 111
112 acl = get_cached_acl(inode, type); 112 acl = get_cached_acl(inode, type);
@@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type)
133 if (!xfs_acl) 133 if (!xfs_acl)
134 return ERR_PTR(-ENOMEM); 134 return ERR_PTR(-ENOMEM);
135 135
136 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 136 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
137 &len, ATTR_ROOT);
137 if (error) { 138 if (error) {
138 /* 139 /*
139 * If the attribute doesn't exist make sure we have a negative 140 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +163,7 @@ STATIC int
162xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 163xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
163{ 164{
164 struct xfs_inode *ip = XFS_I(inode); 165 struct xfs_inode *ip = XFS_I(inode);
165 char *ea_name; 166 unsigned char *ea_name;
166 int error; 167 int error;
167 168
168 if (S_ISLNK(inode->i_mode)) 169 if (S_ISLNK(inode->i_mode))
@@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 (sizeof(struct xfs_acl_entry) * 195 (sizeof(struct xfs_acl_entry) *
195 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 196 (XFS_ACL_MAX_ENTRIES - acl->a_count));
196 197
197 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 198 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
198 len, ATTR_ROOT); 199 len, ATTR_ROOT);
199 200
200 kfree(xfs_acl); 201 kfree(xfs_acl);
@@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
262} 263}
263 264
264static int 265static int
265xfs_acl_exists(struct inode *inode, char *name) 266xfs_acl_exists(struct inode *inode, unsigned char *name)
266{ 267{
267 int len = sizeof(struct xfs_acl); 268 int len = sizeof(struct xfs_acl);
268 269
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c1213..9083357f9e44 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,6 +39,7 @@
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h"
42#include <linux/mpage.h> 43#include <linux/mpage.h>
43#include <linux/pagevec.h> 44#include <linux/pagevec.h>
44#include <linux/writeback.h> 45#include <linux/writeback.h>
@@ -163,14 +164,17 @@ xfs_ioend_new_eof(
163} 164}
164 165
165/* 166/*
166 * Update on-disk file size now that data has been written to disk. 167 * Update on-disk file size now that data has been written to disk. The
167 * The current in-memory file size is i_size. If a write is beyond 168 * current in-memory file size is i_size. If a write is beyond eof i_new_size
168 * eof i_new_size will be the intended file size until i_size is 169 * will be the intended file size until i_size is updated. If this write does
169 * updated. If this write does not extend all the way to the valid 170 * not extend all the way to the valid file size then restrict this update to
170 * file size then restrict this update to the end of the write. 171 * the end of the write.
172 *
173 * This function does not block as blocking on the inode lock in IO completion
174 * can lead to IO completion order dependency deadlocks.. If it can't get the
175 * inode ilock it will return EAGAIN. Callers must handle this.
171 */ 176 */
172 177STATIC int
173STATIC void
174xfs_setfilesize( 178xfs_setfilesize(
175 xfs_ioend_t *ioend) 179 xfs_ioend_t *ioend)
176{ 180{
@@ -181,16 +185,40 @@ xfs_setfilesize(
181 ASSERT(ioend->io_type != IOMAP_READ); 185 ASSERT(ioend->io_type != IOMAP_READ);
182 186
183 if (unlikely(ioend->io_error)) 187 if (unlikely(ioend->io_error))
184 return; 188 return 0;
189
190 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
191 return EAGAIN;
185 192
186 xfs_ilock(ip, XFS_ILOCK_EXCL);
187 isize = xfs_ioend_new_eof(ioend); 193 isize = xfs_ioend_new_eof(ioend);
188 if (isize) { 194 if (isize) {
189 ip->i_d.di_size = isize; 195 ip->i_d.di_size = isize;
190 xfs_mark_inode_dirty_sync(ip); 196 xfs_mark_inode_dirty(ip);
191 } 197 }
192 198
193 xfs_iunlock(ip, XFS_ILOCK_EXCL); 199 xfs_iunlock(ip, XFS_ILOCK_EXCL);
200 return 0;
201}
202
203/*
204 * Schedule IO completion handling on a xfsdatad if this was
205 * the final hold on this ioend. If we are asked to wait,
206 * flush the workqueue.
207 */
208STATIC void
209xfs_finish_ioend(
210 xfs_ioend_t *ioend,
211 int wait)
212{
213 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq;
215
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work);
219 if (wait)
220 flush_workqueue(wq);
221 }
194} 222}
195 223
196/* 224/*
@@ -198,11 +226,11 @@ xfs_setfilesize(
198 */ 226 */
199STATIC void 227STATIC void
200xfs_end_io( 228xfs_end_io(
201 struct work_struct *work) 229 struct work_struct *work)
202{ 230{
203 xfs_ioend_t *ioend = 231 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
204 container_of(work, xfs_ioend_t, io_work); 232 struct xfs_inode *ip = XFS_I(ioend->io_inode);
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 233 int error = 0;
206 234
207 /* 235 /*
208 * For unwritten extents we need to issue transactions to convert a 236 * For unwritten extents we need to issue transactions to convert a
@@ -210,7 +238,6 @@ xfs_end_io(
210 */ 238 */
211 if (ioend->io_type == IOMAP_UNWRITTEN && 239 if (ioend->io_type == IOMAP_UNWRITTEN &&
212 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
213 int error;
214 241
215 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
216 ioend->io_size); 243 ioend->io_size);
@@ -222,30 +249,23 @@ xfs_end_io(
222 * We might have to update the on-disk file size after extending 249 * We might have to update the on-disk file size after extending
223 * writes. 250 * writes.
224 */ 251 */
225 if (ioend->io_type != IOMAP_READ) 252 if (ioend->io_type != IOMAP_READ) {
226 xfs_setfilesize(ioend); 253 error = xfs_setfilesize(ioend);
227 xfs_destroy_ioend(ioend); 254 ASSERT(!error || error == EAGAIN);
228}
229
230/*
231 * Schedule IO completion handling on a xfsdatad if this was
232 * the final hold on this ioend. If we are asked to wait,
233 * flush the workqueue.
234 */
235STATIC void
236xfs_finish_ioend(
237 xfs_ioend_t *ioend,
238 int wait)
239{
240 if (atomic_dec_and_test(&ioend->io_remaining)) {
241 struct workqueue_struct *wq;
242
243 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
244 xfsconvertd_workqueue : xfsdatad_workqueue;
245 queue_work(wq, &ioend->io_work);
246 if (wait)
247 flush_workqueue(wq);
248 } 255 }
256
257 /*
258 * If we didn't complete processing of the ioend, requeue it to the
259 * tail of the workqueue for another attempt later. Otherwise destroy
260 * it.
261 */
262 if (error == EAGAIN) {
263 atomic_inc(&ioend->io_remaining);
264 xfs_finish_ioend(ioend, 0);
265 /* ensure we don't spin on blocked ioends */
266 delay(1);
267 } else
268 xfs_destroy_ioend(ioend);
249} 269}
250 270
251/* 271/*
@@ -341,7 +361,7 @@ xfs_submit_ioend_bio(
341 * but don't update the inode size until I/O completion. 361 * but don't update the inode size until I/O completion.
342 */ 362 */
343 if (xfs_ioend_new_eof(ioend)) 363 if (xfs_ioend_new_eof(ioend))
344 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 364 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
345 365
346 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 366 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
347 WRITE_SYNC_PLUG : WRITE, bio); 367 WRITE_SYNC_PLUG : WRITE, bio);
@@ -874,6 +894,118 @@ xfs_cluster_write(
874 } 894 }
875} 895}
876 896
897STATIC void
898xfs_vm_invalidatepage(
899 struct page *page,
900 unsigned long offset)
901{
902 trace_xfs_invalidatepage(page->mapping->host, page, offset);
903 block_invalidatepage(page, offset);
904}
905
906/*
907 * If the page has delalloc buffers on it, we need to punch them out before we
908 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
909 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
910 * is done on that same region - the delalloc extent is returned when none is
911 * supposed to be there.
912 *
913 * We prevent this by truncating away the delalloc regions on the page before
914 * invalidating it. Because they are delalloc, we can do this without needing a
915 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
916 * truncation without a transaction as there is no space left for block
 917 * reservation (typically why we see an ENOSPC in writeback).
918 *
919 * This is not a performance critical path, so for now just do the punching a
920 * buffer head at a time.
921 */
922STATIC void
923xfs_aops_discard_page(
924 struct page *page)
925{
926 struct inode *inode = page->mapping->host;
927 struct xfs_inode *ip = XFS_I(inode);
928 struct buffer_head *bh, *head;
929 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits;
931
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
933 goto out_invalidate;
934
935 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
936 "page discard on page %p, inode 0x%llx, offset %llu.",
937 page, ip->i_ino, offset);
938
939 xfs_ilock(ip, XFS_ILOCK_EXCL);
940 bh = head = page_buffers(page);
941 do {
942 int done;
943 xfs_fileoff_t offset_fsb;
944 xfs_bmbt_irec_t imap;
945 int nimaps = 1;
946 int error;
947 xfs_fsblock_t firstblock;
948 xfs_bmap_free_t flist;
949
950 if (!buffer_delay(bh))
951 goto next_buffer;
952
953 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
954
955 /*
956 * Map the range first and check that it is a delalloc extent
957 * before trying to unmap the range. Otherwise we will be
958 * trying to remove a real extent (which requires a
959 * transaction) or a hole, which is probably a bad idea...
960 */
961 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
962 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
963 &nimaps, NULL, NULL);
964
965 if (error) {
966 /* something screwed, just bail */
967 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
968 "page discard failed delalloc mapping lookup.");
969 break;
970 }
971 if (!nimaps) {
972 /* nothing there */
973 goto next_buffer;
974 }
975 if (imap.br_startblock != DELAYSTARTBLOCK) {
976 /* been converted, ignore */
977 goto next_buffer;
978 }
979 WARN_ON(imap.br_blockcount == 0);
980
981 /*
982 * Note: while we initialise the firstblock/flist pair, they
983 * should never be used because blocks should never be
 984 * allocated or freed for a delalloc extent and hence we don't
 985 * need to cancel or finish them after the xfs_bunmapi() call.
986 */
987 xfs_bmap_init(&flist, &firstblock);
988 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
989 &flist, NULL, &done);
990
991 ASSERT(!flist.xbf_count && !flist.xbf_first);
992 if (error) {
993 /* something screwed, just bail */
994 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
995 "page discard unable to remove delalloc mapping.");
996 break;
997 }
998next_buffer:
999 offset += len;
1000
1001 } while ((bh = bh->b_this_page) != head);
1002
1003 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1004out_invalidate:
1005 xfs_vm_invalidatepage(page, 0);
1006 return;
1007}
1008
877/* 1009/*
878 * Calling this without startio set means we are being asked to make a dirty 1010 * Calling this without startio set means we are being asked to make a dirty
879 * page ready for freeing it's buffers. When called with startio set then 1011 * page ready for freeing it's buffers. When called with startio set then
@@ -1125,7 +1257,7 @@ error:
1125 */ 1257 */
1126 if (err != -EAGAIN) { 1258 if (err != -EAGAIN) {
1127 if (!unmapped) 1259 if (!unmapped)
1128 block_invalidatepage(page, 0); 1260 xfs_aops_discard_page(page);
1129 ClearPageUptodate(page); 1261 ClearPageUptodate(page);
1130 } 1262 }
1131 return err; 1263 return err;
@@ -1535,15 +1667,6 @@ xfs_vm_readpages(
1535 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1667 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1536} 1668}
1537 1669
1538STATIC void
1539xfs_vm_invalidatepage(
1540 struct page *page,
1541 unsigned long offset)
1542{
1543 trace_xfs_invalidatepage(page->mapping->host, page, offset);
1544 block_invalidatepage(page, offset);
1545}
1546
1547const struct address_space_operations xfs_address_space_operations = { 1670const struct address_space_operations xfs_address_space_operations = {
1548 .readpage = xfs_vm_readpage, 1671 .readpage = xfs_vm_readpage,
1549 .readpages = xfs_vm_readpages, 1672 .readpages = xfs_vm_readpages,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..6f76ba85f193 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
@@ -76,6 +77,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
76#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
77 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
78 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
79/* 101/*
80 * Page Region interfaces. 102 * Page Region interfaces.
81 * 103 *
@@ -314,7 +336,7 @@ xfs_buf_free(
314 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 336 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
315 uint i; 337 uint i;
316 338
317 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 339 if (xfs_buf_is_vmapped(bp))
318 free_address(bp->b_addr - bp->b_offset); 340 free_address(bp->b_addr - bp->b_offset);
319 341
320 for (i = 0; i < bp->b_page_count; i++) { 342 for (i = 0; i < bp->b_page_count; i++) {
@@ -1051,22 +1073,30 @@ xfs_buf_ioerror(
1051} 1073}
1052 1074
1053int 1075int
1054xfs_bawrite( 1076xfs_bwrite(
1055 void *mp, 1077 struct xfs_mount *mp,
1056 struct xfs_buf *bp) 1078 struct xfs_buf *bp)
1057{ 1079{
1058 trace_xfs_buf_bawrite(bp, _RET_IP_); 1080 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1081 int error = 0;
1059 1082
1060 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1083 bp->b_strat = xfs_bdstrat_cb;
1084 bp->b_mount = mp;
1085 bp->b_flags |= XBF_WRITE;
1086 if (!iowait)
1087 bp->b_flags |= _XBF_RUN_QUEUES;
1061 1088
1062 xfs_buf_delwri_dequeue(bp); 1089 xfs_buf_delwri_dequeue(bp);
1090 xfs_buf_iostrategy(bp);
1063 1091
1064 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1092 if (iowait) {
1065 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1093 error = xfs_buf_iowait(bp);
1094 if (error)
1095 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1096 xfs_buf_relse(bp);
1097 }
1066 1098
1067 bp->b_mount = mp; 1099 return error;
1068 bp->b_strat = xfs_bdstrat_cb;
1069 return xfs_bdstrat_cb(bp);
1070} 1100}
1071 1101
1072void 1102void
@@ -1085,6 +1115,126 @@ xfs_bdwrite(
1085 xfs_buf_delwri_queue(bp, 1); 1115 xfs_buf_delwri_queue(bp, 1);
1086} 1116}
1087 1117
1118/*
1119 * Called when we want to stop a buffer from getting written or read.
1120 * We attach the EIO error, muck with its flags, and call biodone
1121 * so that the proper iodone callbacks get called.
1122 */
1123STATIC int
1124xfs_bioerror(
1125 xfs_buf_t *bp)
1126{
1127#ifdef XFSERRORDEBUG
1128 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1129#endif
1130
1131 /*
1132 * No need to wait until the buffer is unpinned, we aren't flushing it.
1133 */
1134 XFS_BUF_ERROR(bp, EIO);
1135
1136 /*
1137 * We're calling biodone, so delete XBF_DONE flag.
1138 */
1139 XFS_BUF_UNREAD(bp);
1140 XFS_BUF_UNDELAYWRITE(bp);
1141 XFS_BUF_UNDONE(bp);
1142 XFS_BUF_STALE(bp);
1143
1144 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1145 xfs_biodone(bp);
1146
1147 return EIO;
1148}
1149
1150/*
1151 * Same as xfs_bioerror, except that we are releasing the buffer
1152 * here ourselves, and avoiding the biodone call.
1153 * This is meant for userdata errors; metadata bufs come with
1154 * iodone functions attached, so that we can track down errors.
1155 */
1156STATIC int
1157xfs_bioerror_relse(
1158 struct xfs_buf *bp)
1159{
1160 int64_t fl = XFS_BUF_BFLAGS(bp);
1161 /*
1162 * No need to wait until the buffer is unpinned.
1163 * We aren't flushing it.
1164 *
1165 * chunkhold expects B_DONE to be set, whether
1166 * we actually finish the I/O or not. We don't want to
1167 * change that interface.
1168 */
1169 XFS_BUF_UNREAD(bp);
1170 XFS_BUF_UNDELAYWRITE(bp);
1171 XFS_BUF_DONE(bp);
1172 XFS_BUF_STALE(bp);
1173 XFS_BUF_CLR_IODONE_FUNC(bp);
1174 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1175 if (!(fl & XBF_ASYNC)) {
1176 /*
1177 * Mark b_error and B_ERROR _both_.
 1178 * Lots of chunkcache code assumes that.
1179 * There's no reason to mark error for
1180 * ASYNC buffers.
1181 */
1182 XFS_BUF_ERROR(bp, EIO);
1183 XFS_BUF_FINISH_IOWAIT(bp);
1184 } else {
1185 xfs_buf_relse(bp);
1186 }
1187
1188 return EIO;
1189}
1190
1191
1192/*
1193 * All xfs metadata buffers except log state machine buffers
1194 * get this attached as their b_bdstrat callback function.
1195 * This is so that we can catch a buffer
1196 * after prematurely unpinning it to forcibly shutdown the filesystem.
1197 */
1198int
1199xfs_bdstrat_cb(
1200 struct xfs_buf *bp)
1201{
1202 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1203 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1204 /*
1205 * Metadata write that didn't get logged but
1206 * written delayed anyway. These aren't associated
1207 * with a transaction, and can be ignored.
1208 */
1209 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1210 return xfs_bioerror_relse(bp);
1211 else
1212 return xfs_bioerror(bp);
1213 }
1214
1215 xfs_buf_iorequest(bp);
1216 return 0;
1217}
1218
1219/*
1220 * Wrapper around bdstrat so that we can stop data from going to disk in case
1221 * we are shutting down the filesystem. Typically user data goes thru this
1222 * path; one of the exceptions is the superblock.
1223 */
1224void
1225xfsbdstrat(
1226 struct xfs_mount *mp,
1227 struct xfs_buf *bp)
1228{
1229 if (XFS_FORCED_SHUTDOWN(mp)) {
1230 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1231 xfs_bioerror_relse(bp);
1232 return;
1233 }
1234
1235 xfs_buf_iorequest(bp);
1236}
1237
1088STATIC void 1238STATIC void
1089_xfs_buf_ioend( 1239_xfs_buf_ioend(
1090 xfs_buf_t *bp, 1240 xfs_buf_t *bp,
@@ -1107,6 +1257,9 @@ xfs_buf_bio_end_io(
1107 1257
1108 xfs_buf_ioerror(bp, -error); 1258 xfs_buf_ioerror(bp, -error);
1109 1259
1260 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1261 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1262
1110 do { 1263 do {
1111 struct page *page = bvec->bv_page; 1264 struct page *page = bvec->bv_page;
1112 1265
@@ -1216,6 +1369,10 @@ next_chunk:
1216 1369
1217submit_io: 1370submit_io:
1218 if (likely(bio->bi_size)) { 1371 if (likely(bio->bi_size)) {
1372 if (xfs_buf_is_vmapped(bp)) {
1373 flush_kernel_vmap_range(bp->b_addr,
1374 xfs_buf_vmap_len(bp));
1375 }
1219 submit_bio(rw, bio); 1376 submit_bio(rw, bio);
1220 if (size) 1377 if (size)
1221 goto next_chunk; 1378 goto next_chunk;
@@ -1296,7 +1453,7 @@ xfs_buf_iomove(
1296 xfs_buf_t *bp, /* buffer to process */ 1453 xfs_buf_t *bp, /* buffer to process */
1297 size_t boff, /* starting buffer offset */ 1454 size_t boff, /* starting buffer offset */
1298 size_t bsize, /* length to copy */ 1455 size_t bsize, /* length to copy */
1299 caddr_t data, /* data address */ 1456 void *data, /* data address */
1300 xfs_buf_rw_t mode) /* read/write/zero flag */ 1457 xfs_buf_rw_t mode) /* read/write/zero flag */
1301{ 1458{
1302 size_t bend, cpoff, csize; 1459 size_t bend, cpoff, csize;
@@ -1378,8 +1535,8 @@ xfs_alloc_bufhash(
1378 1535
1379 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1536 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1380 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1537 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1381 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1538 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1382 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1539 sizeof(xfs_bufhash_t));
1383 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1540 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1384 spin_lock_init(&btp->bt_hash[i].bh_lock); 1541 spin_lock_init(&btp->bt_hash[i].bh_lock);
1385 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1542 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1547,7 @@ STATIC void
1390xfs_free_bufhash( 1547xfs_free_bufhash(
1391 xfs_buftarg_t *btp) 1548 xfs_buftarg_t *btp)
1392{ 1549{
1393 kmem_free(btp->bt_hash); 1550 kmem_free_large(btp->bt_hash);
1394 btp->bt_hash = NULL; 1551 btp->bt_hash = NULL;
1395} 1552}
1396 1553
@@ -1595,6 +1752,11 @@ xfs_buf_delwri_queue(
1595 list_del(&bp->b_list); 1752 list_del(&bp->b_list);
1596 } 1753 }
1597 1754
1755 if (list_empty(dwq)) {
1756 /* start xfsbufd as it is about to have something to do */
1757 wake_up_process(bp->b_target->bt_task);
1758 }
1759
1598 bp->b_flags |= _XBF_DELWRI_Q; 1760 bp->b_flags |= _XBF_DELWRI_Q;
1599 list_add_tail(&bp->b_list, dwq); 1761 list_add_tail(&bp->b_list, dwq);
1600 bp->b_queuetime = jiffies; 1762 bp->b_queuetime = jiffies;
@@ -1626,6 +1788,35 @@ xfs_buf_delwri_dequeue(
1626 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); 1788 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1627} 1789}
1628 1790
1791/*
1792 * If a delwri buffer needs to be pushed before it has aged out, then promote
1793 * it to the head of the delwri queue so that it will be flushed on the next
1794 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1795 * than the age currently needed to flush the buffer. Hence the next time the
1796 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1797 */
1798void
1799xfs_buf_delwri_promote(
1800 struct xfs_buf *bp)
1801{
1802 struct xfs_buftarg *btp = bp->b_target;
1803 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1804
1805 ASSERT(bp->b_flags & XBF_DELWRI);
1806 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1807
1808 /*
1809 * Check the buffer age before locking the delayed write queue as we
1810 * don't need to promote buffers that are already past the flush age.
1811 */
1812 if (bp->b_queuetime < jiffies - age)
1813 return;
1814 bp->b_queuetime = jiffies - age;
1815 spin_lock(&btp->bt_delwrite_lock);
1816 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1817 spin_unlock(&btp->bt_delwrite_lock);
1818}
1819
1629STATIC void 1820STATIC void
1630xfs_buf_runall_queues( 1821xfs_buf_runall_queues(
1631 struct workqueue_struct *queue) 1822 struct workqueue_struct *queue)
@@ -1644,6 +1835,8 @@ xfsbufd_wakeup(
1644 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1835 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1645 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1836 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1646 continue; 1837 continue;
1838 if (list_empty(&btp->bt_delwrite_queue))
1839 continue;
1647 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1840 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1648 wake_up_process(btp->bt_task); 1841 wake_up_process(btp->bt_task);
1649 } 1842 }
@@ -1694,20 +1887,53 @@ xfs_buf_delwri_split(
1694 1887
1695} 1888}
1696 1889
1890/*
1891 * Compare function is more complex than it needs to be because
1892 * the return value is only 32 bits and we are doing comparisons
1893 * on 64 bit values
1894 */
1895static int
1896xfs_buf_cmp(
1897 void *priv,
1898 struct list_head *a,
1899 struct list_head *b)
1900{
1901 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1902 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1903 xfs_daddr_t diff;
1904
1905 diff = ap->b_bn - bp->b_bn;
1906 if (diff < 0)
1907 return -1;
1908 if (diff > 0)
1909 return 1;
1910 return 0;
1911}
1912
1913void
1914xfs_buf_delwri_sort(
1915 xfs_buftarg_t *target,
1916 struct list_head *list)
1917{
1918 list_sort(NULL, list, xfs_buf_cmp);
1919}
1920
1697STATIC int 1921STATIC int
1698xfsbufd( 1922xfsbufd(
1699 void *data) 1923 void *data)
1700{ 1924{
1701 struct list_head tmp; 1925 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1702 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1703 int count;
1704 xfs_buf_t *bp;
1705 1926
1706 current->flags |= PF_MEMALLOC; 1927 current->flags |= PF_MEMALLOC;
1707 1928
1708 set_freezable(); 1929 set_freezable();
1709 1930
1710 do { 1931 do {
1932 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1933 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1934 int count = 0;
1935 struct list_head tmp;
1936
1711 if (unlikely(freezing(current))) { 1937 if (unlikely(freezing(current))) {
1712 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1938 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1713 refrigerator(); 1939 refrigerator();
@@ -1715,17 +1941,16 @@ xfsbufd(
1715 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1941 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1716 } 1942 }
1717 1943
1718 schedule_timeout_interruptible( 1944 /* sleep for a long time if there is nothing to do. */
1719 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1945 if (list_empty(&target->bt_delwrite_queue))
1946 tout = MAX_SCHEDULE_TIMEOUT;
1947 schedule_timeout_interruptible(tout);
1720 1948
1721 xfs_buf_delwri_split(target, &tmp, 1949 xfs_buf_delwri_split(target, &tmp, age);
1722 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1950 list_sort(NULL, &tmp, xfs_buf_cmp);
1723
1724 count = 0;
1725 while (!list_empty(&tmp)) { 1951 while (!list_empty(&tmp)) {
1726 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1952 struct xfs_buf *bp;
1727 ASSERT(target == bp->b_target); 1953 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1728
1729 list_del_init(&bp->b_list); 1954 list_del_init(&bp->b_list);
1730 xfs_buf_iostrategy(bp); 1955 xfs_buf_iostrategy(bp);
1731 count++; 1956 count++;
@@ -1751,42 +1976,45 @@ xfs_flush_buftarg(
1751 xfs_buftarg_t *target, 1976 xfs_buftarg_t *target,
1752 int wait) 1977 int wait)
1753{ 1978{
1754 struct list_head tmp; 1979 xfs_buf_t *bp;
1755 xfs_buf_t *bp, *n;
1756 int pincount = 0; 1980 int pincount = 0;
1981 LIST_HEAD(tmp_list);
1982 LIST_HEAD(wait_list);
1757 1983
1758 xfs_buf_runall_queues(xfsconvertd_workqueue); 1984 xfs_buf_runall_queues(xfsconvertd_workqueue);
1759 xfs_buf_runall_queues(xfsdatad_workqueue); 1985 xfs_buf_runall_queues(xfsdatad_workqueue);
1760 xfs_buf_runall_queues(xfslogd_workqueue); 1986 xfs_buf_runall_queues(xfslogd_workqueue);
1761 1987
1762 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1988 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1763 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1989 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1764 1990
1765 /* 1991 /*
1766 * Dropped the delayed write list lock, now walk the temporary list 1992 * Dropped the delayed write list lock, now walk the temporary list.
1993 * All I/O is issued async and then if we need to wait for completion
1994 * we do that after issuing all the IO.
1767 */ 1995 */
1768 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1996 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1997 while (!list_empty(&tmp_list)) {
1998 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1769 ASSERT(target == bp->b_target); 1999 ASSERT(target == bp->b_target);
1770 if (wait) 2000 list_del_init(&bp->b_list);
2001 if (wait) {
1771 bp->b_flags &= ~XBF_ASYNC; 2002 bp->b_flags &= ~XBF_ASYNC;
1772 else 2003 list_add(&bp->b_list, &wait_list);
1773 list_del_init(&bp->b_list); 2004 }
1774
1775 xfs_buf_iostrategy(bp); 2005 xfs_buf_iostrategy(bp);
1776 } 2006 }
1777 2007
1778 if (wait) 2008 if (wait) {
2009 /* Expedite and wait for IO to complete. */
1779 blk_run_address_space(target->bt_mapping); 2010 blk_run_address_space(target->bt_mapping);
2011 while (!list_empty(&wait_list)) {
2012 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1780 2013
1781 /* 2014 list_del_init(&bp->b_list);
1782 * Remaining list items must be flushed before returning 2015 xfs_iowait(bp);
1783 */ 2016 xfs_buf_relse(bp);
1784 while (!list_empty(&tmp)) { 2017 }
1785 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1786
1787 list_del_init(&bp->b_list);
1788 xfs_iowait(bp);
1789 xfs_buf_relse(bp);
1790 } 2018 }
1791 2019
1792 return pincount; 2020 return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
232extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
233 233
234/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
235extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
237extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
238extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
239extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
240extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
241extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
242 xfs_buf_rw_t); 246 xfs_buf_rw_t);
243 247
244static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
261 265
262/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
263extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
264 269
265/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
266extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
270 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
271 276
272 277
273#define XFS_B_ASYNC XBF_ASYNC
274#define XFS_B_DELWRI XBF_DELWRI
275#define XFS_B_READ XBF_READ
276#define XFS_B_WRITE XBF_WRITE
277#define XFS_B_STALE XBF_STALE
278
279#define XFS_BUF_TRYLOCK XBF_TRYLOCK
280#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
281#define XFS_BUF_LOCK XBF_LOCK
282#define XFS_BUF_MAPPED XBF_MAPPED
283
284#define BUF_BUSY XBF_DONT_BLOCK
285
286#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
287#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
288 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
289 281
290#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
291#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
292#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
293#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
294 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
295 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
296 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
297 } while (0) 289 } while (0)
298 290
299#define XFS_BUF_MANAGE XBF_FS_MANAGED
300#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
301 292
302#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
385 376
386#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
387 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
388 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
389 380
390#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
391 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
392 383
393
394static inline int XFS_bwrite(xfs_buf_t *bp)
395{
396 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
397 int error = 0;
398
399 if (!iowait)
400 bp->b_flags |= _XBF_RUN_QUEUES;
401
402 xfs_buf_delwri_dequeue(bp);
403 xfs_buf_iostrategy(bp);
404 if (iowait) {
405 error = xfs_buf_iowait(bp);
406 xfs_buf_relse(bp);
407 }
408 return error;
409}
410
411#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
412
413#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
414 385
415#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -424,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
424extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
425extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
426extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
427#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
428extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
429#endif 401#endif
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..846b75aeb2ab 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
215 return d_obtain_alias(VFS_I(cip)); 216 return d_obtain_alias(VFS_I(cip));
216} 217}
217 218
219STATIC int
220xfs_fs_nfs_commit_metadata(
221 struct inode *inode)
222{
223 struct xfs_inode *ip = XFS_I(inode);
224 struct xfs_mount *mp = ip->i_mount;
225 int error = 0;
226
227 xfs_ilock(ip, XFS_ILOCK_SHARED);
228 if (xfs_ipincount(ip)) {
229 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
230 XFS_LOG_SYNC, NULL);
231 }
232 xfs_iunlock(ip, XFS_ILOCK_SHARED);
233
234 return error;
235}
236
218const struct export_operations xfs_export_operations = { 237const struct export_operations xfs_export_operations = {
219 .encode_fh = xfs_fs_encode_fh, 238 .encode_fh = xfs_fs_encode_fh,
220 .fh_to_dentry = xfs_fs_fh_to_dentry, 239 .fh_to_dentry = xfs_fs_fh_to_dentry,
221 .fh_to_parent = xfs_fs_fh_to_parent, 240 .fh_to_parent = xfs_fs_fh_to_parent,
222 .get_parent = xfs_fs_get_parent, 241 .get_parent = xfs_fs_get_parent,
242 .commit_metadata = xfs_fs_nfs_commit_metadata,
223}; 243};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2e..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of buffer supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = 0; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
 134 * of non-transactional updates now goes through mark_inode_dirty*,
 135 * which allows us to distinguish between pure timestamp updates
 136 * and i_size updates which need to be caught for fdatasync.
 137 * After that also check for the dirty state in the XFS inode, which
 138 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
 180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
74 int ioflags = 0; 223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
242 * length ever wraps negative then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
499 * size was updated but after blocks were allocated. If fill is set,
500 * then any holes in the range are filled and zeroed. If not, the holes
501 * are left alone as holes.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
728 /*
729 * If the offset is beyond the size of the file, we have a couple
730 * of things to do. First, if there is already space allocated
731 * we need to either create holes or zero the disk or ...
732 *
733 * If there is a page where the previous size lands, we need
734 * to zero it out up to the new size.
735 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
805 * if we just got an ENOSPC, flush the inode now we
806 * aren't holding any page locks and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
882 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
118 896
119STATIC int 897STATIC int
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
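
The rewritten write path above rejects direct I/O whose offset or length is not aligned to the target device's sector mask (the bt_smask checks return EINVAL), and falls back to buffered I/O when a direct write hits a hole. A minimal userspace sketch of an aligned O_DIRECT write follows; it is not part of the patch, and the 4096-byte alignment and the /mnt/xfs/testfile path are assumptions, the real requirement being the logical sector size of the data or realtime device.

/*
 * Sketch: an O_DIRECT write with both the buffer and the I/O length
 * aligned, so it passes the bt_smask checks in xfs_file_aio_write().
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096;		/* count must be sector-aligned */
	void *buf;
	int fd;

	if (posix_memalign(&buf, 4096, len))	/* buffer must be aligned too */
		return 1;
	memset(buf, 'x', len);

	fd = open("/mnt/xfs/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* offset 0 is sector-aligned; a misaligned offset would get EINVAL */
	if (pwrite(fd, buf, len, 0) != (ssize_t)len)
		perror("pwrite");

	close(fd);
	free(buf);
	return 0;
}

A write at a misaligned offset, or with a misaligned length, is refused before any I/O is issued, which is why O_DIRECT consumers normally size their buffers in whole filesystem blocks.
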
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
79 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
80 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
81 } 81 }
82 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
83 return ret; 83 return ret;
84 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
85 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..4ea1ee18aded 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -447,12 +447,12 @@ xfs_attrlist_by_handle(
447int 447int
448xfs_attrmulti_attr_get( 448xfs_attrmulti_attr_get(
449 struct inode *inode, 449 struct inode *inode,
450 char *name, 450 unsigned char *name,
451 char __user *ubuf, 451 unsigned char __user *ubuf,
452 __uint32_t *len, 452 __uint32_t *len,
453 __uint32_t flags) 453 __uint32_t flags)
454{ 454{
455 char *kbuf; 455 unsigned char *kbuf;
456 int error = EFAULT; 456 int error = EFAULT;
457 457
458 if (*len > XATTR_SIZE_MAX) 458 if (*len > XATTR_SIZE_MAX)
@@ -476,12 +476,12 @@ xfs_attrmulti_attr_get(
476int 476int
477xfs_attrmulti_attr_set( 477xfs_attrmulti_attr_set(
478 struct inode *inode, 478 struct inode *inode,
479 char *name, 479 unsigned char *name,
480 const char __user *ubuf, 480 const unsigned char __user *ubuf,
481 __uint32_t len, 481 __uint32_t len,
482 __uint32_t flags) 482 __uint32_t flags)
483{ 483{
484 char *kbuf; 484 unsigned char *kbuf;
485 int error = EFAULT; 485 int error = EFAULT;
486 486
487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +501,7 @@ xfs_attrmulti_attr_set(
501int 501int
502xfs_attrmulti_attr_remove( 502xfs_attrmulti_attr_remove(
503 struct inode *inode, 503 struct inode *inode,
504 char *name, 504 unsigned char *name,
505 __uint32_t flags) 505 __uint32_t flags)
506{ 506{
507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +519,7 @@ xfs_attrmulti_by_handle(
519 xfs_fsop_attrmulti_handlereq_t am_hreq; 519 xfs_fsop_attrmulti_handlereq_t am_hreq;
520 struct dentry *dentry; 520 struct dentry *dentry;
521 unsigned int i, size; 521 unsigned int i, size;
522 char *attr_name; 522 unsigned char *attr_name;
523 523
524 if (!capable(CAP_SYS_ADMIN)) 524 if (!capable(CAP_SYS_ADMIN))
525 return -XFS_ERROR(EPERM); 525 return -XFS_ERROR(EPERM);
@@ -547,7 +547,7 @@ xfs_attrmulti_by_handle(
547 547
548 error = 0; 548 error = 0;
549 for (i = 0; i < am_hreq.opcount; i++) { 549 for (i = 0; i < am_hreq.opcount; i++) {
550 ops[i].am_error = strncpy_from_user(attr_name, 550 ops[i].am_error = strncpy_from_user((char *)attr_name,
551 ops[i].am_attrname, MAXNAMELEN); 551 ops[i].am_attrname, MAXNAMELEN);
552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
553 error = -ERANGE; 553 error = -ERANGE;
@@ -1431,6 +1431,9 @@ xfs_file_ioctl(
1431 if (!capable(CAP_SYS_ADMIN)) 1431 if (!capable(CAP_SYS_ADMIN))
1432 return -EPERM; 1432 return -EPERM;
1433 1433
1434 if (mp->m_flags & XFS_MOUNT_RDONLY)
1435 return -XFS_ERROR(EROFS);
1436
1434 if (copy_from_user(&inout, arg, sizeof(inout))) 1437 if (copy_from_user(&inout, arg, sizeof(inout)))
1435 return -XFS_ERROR(EFAULT); 1438 return -XFS_ERROR(EFAULT);
1436 1439
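
The attrmulti handlers above now treat attribute names as unsigned char buffers and keep each value capped at XATTR_SIZE_MAX; the same attributes are reachable through the generic extended-attribute system calls. A small sketch follows; the attribute name user.comment and the path are made up for illustration.

/*
 * Sketch: setting and reading back an extended attribute with the
 * generic xattr syscalls, the same data the XFS attrmulti ioctls serve.
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/xfs/testfile";
	const char *value = "hello";
	char buf[64];
	ssize_t len;

	if (setxattr(path, "user.comment", value, strlen(value), 0) != 0) {
		perror("setxattr");
		return 1;
	}

	len = getxattr(path, "user.comment", buf, sizeof(buf) - 1);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf[len] = '\0';
	printf("user.comment = %s\n", buf);
	return 0;
}
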
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..0bf6d61f0528 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle(
411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
412 struct dentry *dentry; 412 struct dentry *dentry;
413 unsigned int i, size; 413 unsigned int i, size;
414 char *attr_name; 414 unsigned char *attr_name;
415 415
416 if (!capable(CAP_SYS_ADMIN)) 416 if (!capable(CAP_SYS_ADMIN))
417 return -XFS_ERROR(EPERM); 417 return -XFS_ERROR(EPERM);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
440 440
441 error = 0; 441 error = 0;
442 for (i = 0; i < am_hreq.opcount; i++) { 442 for (i = 0; i < am_hreq.opcount; i++) {
443 ops[i].am_error = strncpy_from_user(attr_name, 443 ops[i].am_error = strncpy_from_user((char *)attr_name,
444 compat_ptr(ops[i].am_attrname), 444 compat_ptr(ops[i].am_attrname),
445 MAXNAMELEN); 445 MAXNAMELEN);
446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..61a99608731e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync(
91 mark_inode_dirty_sync(inode); 91 mark_inode_dirty_sync(inode);
92} 92}
93 93
94void
95xfs_mark_inode_dirty(
96 xfs_inode_t *ip)
97{
98 struct inode *inode = VFS_I(ip);
99
100 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
101 mark_inode_dirty(inode);
102}
103
94/* 104/*
95 * Change the requested timestamp in the given inode. 105 * Change the requested timestamp in the given inode.
96 * We don't lock across timestamp updates, and we don't log them but 106 * We don't lock across timestamp updates, and we don't log them but
@@ -140,10 +150,10 @@ xfs_init_security(
140 struct xfs_inode *ip = XFS_I(inode); 150 struct xfs_inode *ip = XFS_I(inode);
141 size_t length; 151 size_t length;
142 void *value; 152 void *value;
143 char *name; 153 unsigned char *name;
144 int error; 154 int error;
145 155
146 error = security_inode_init_security(inode, dir, &name, 156 error = security_inode_init_security(inode, dir, (char **)&name,
147 &value, &length); 157 &value, &length);
148 if (error) { 158 if (error) {
149 if (error == -EOPNOTSUPP) 159 if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1ae..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
91#include <xfs_lrw.h>
92#include <xfs_buf.h> 91#include <xfs_buf.h>
93 92
94/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index 0d32457abef1..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,852 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51#include "xfs_trace.h"
52
53#include <linux/capability.h>
54#include <linux/writeback.h>
55
56
57/*
58 * xfs_iozero
59 *
60 * xfs_iozero clears the specified range of buffer supplied,
61 * and marks all the affected blocks as valid and modified. If
62 * an affected block is not allocated, it will be allocated. If
63 * an affected block is not completely overwritten, and is not
64 * valid before the operation, it will be read from disk before
65 * being partially zeroed.
66 */
67STATIC int
68xfs_iozero(
69 struct xfs_inode *ip, /* inode */
70 loff_t pos, /* offset in file */
71 size_t count) /* size of data to zero */
72{
73 struct page *page;
74 struct address_space *mapping;
75 int status;
76
77 mapping = VFS_I(ip)->i_mapping;
78 do {
79 unsigned offset, bytes;
80 void *fsdata;
81
82 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
83 bytes = PAGE_CACHE_SIZE - offset;
84 if (bytes > count)
85 bytes = count;
86
87 status = pagecache_write_begin(NULL, mapping, pos, bytes,
88 AOP_FLAG_UNINTERRUPTIBLE,
89 &page, &fsdata);
90 if (status)
91 break;
92
93 zero_user(page, offset, bytes);
94
95 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
96 page, fsdata);
97 WARN_ON(status <= 0); /* can't return less than zero! */
98 pos += bytes;
99 count -= bytes;
100 status = 0;
101 } while (count);
102
103 return (-status);
104}
105
106ssize_t /* bytes read, or (-) error */
107xfs_read(
108 xfs_inode_t *ip,
109 struct kiocb *iocb,
110 const struct iovec *iovp,
111 unsigned int segs,
112 loff_t *offset,
113 int ioflags)
114{
115 struct file *file = iocb->ki_filp;
116 struct inode *inode = file->f_mapping->host;
117 xfs_mount_t *mp = ip->i_mount;
118 size_t size = 0;
119 ssize_t ret = 0;
120 xfs_fsize_t n;
121 unsigned long seg;
122
123
124 XFS_STATS_INC(xs_read_calls);
125
126 /* START copy & waste from filemap.c */
127 for (seg = 0; seg < segs; seg++) {
128 const struct iovec *iv = &iovp[seg];
129
130 /*
131 * If any segment has a negative length, or the cumulative
132 * length ever wraps negative then return -EINVAL.
133 */
134 size += iv->iov_len;
135 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
136 return XFS_ERROR(-EINVAL);
137 }
138 /* END copy & waste from filemap.c */
139
140 if (unlikely(ioflags & IO_ISDIRECT)) {
141 xfs_buftarg_t *target =
142 XFS_IS_REALTIME_INODE(ip) ?
143 mp->m_rtdev_targp : mp->m_ddev_targp;
144 if ((*offset & target->bt_smask) ||
145 (size & target->bt_smask)) {
146 if (*offset == ip->i_size) {
147 return (0);
148 }
149 return -XFS_ERROR(EINVAL);
150 }
151 }
152
153 n = XFS_MAXIOFFSET(mp) - *offset;
154 if ((n <= 0) || (size == 0))
155 return 0;
156
157 if (n < size)
158 size = n;
159
160 if (XFS_FORCED_SHUTDOWN(mp))
161 return -EIO;
162
163 if (unlikely(ioflags & IO_ISDIRECT))
164 mutex_lock(&inode->i_mutex);
165 xfs_ilock(ip, XFS_IOLOCK_SHARED);
166
167 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
168 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
169 int iolock = XFS_IOLOCK_SHARED;
170
171 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
172 dmflags, &iolock);
173 if (ret) {
174 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
175 if (unlikely(ioflags & IO_ISDIRECT))
176 mutex_unlock(&inode->i_mutex);
177 return ret;
178 }
179 }
180
181 if (unlikely(ioflags & IO_ISDIRECT)) {
182 if (inode->i_mapping->nrpages)
183 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
184 -1, FI_REMAPF_LOCKED);
185 mutex_unlock(&inode->i_mutex);
186 if (ret) {
187 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
188 return ret;
189 }
190 }
191
192 trace_xfs_file_read(ip, size, *offset, ioflags);
193
194 iocb->ki_pos = *offset;
195 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
196 if (ret > 0)
197 XFS_STATS_ADD(xs_read_bytes, ret);
198
199 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
200 return ret;
201}
202
203ssize_t
204xfs_splice_read(
205 xfs_inode_t *ip,
206 struct file *infilp,
207 loff_t *ppos,
208 struct pipe_inode_info *pipe,
209 size_t count,
210 int flags,
211 int ioflags)
212{
213 xfs_mount_t *mp = ip->i_mount;
214 ssize_t ret;
215
216 XFS_STATS_INC(xs_read_calls);
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 return -EIO;
219
220 xfs_ilock(ip, XFS_IOLOCK_SHARED);
221
222 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
223 int iolock = XFS_IOLOCK_SHARED;
224 int error;
225
226 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
227 FILP_DELAY_FLAG(infilp), &iolock);
228 if (error) {
229 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
230 return -error;
231 }
232 }
233
234 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
235
236 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
237 if (ret > 0)
238 XFS_STATS_ADD(xs_read_bytes, ret);
239
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241 return ret;
242}
243
244ssize_t
245xfs_splice_write(
246 xfs_inode_t *ip,
247 struct pipe_inode_info *pipe,
248 struct file *outfilp,
249 loff_t *ppos,
250 size_t count,
251 int flags,
252 int ioflags)
253{
254 xfs_mount_t *mp = ip->i_mount;
255 ssize_t ret;
256 struct inode *inode = outfilp->f_mapping->host;
257 xfs_fsize_t isize, new_size;
258
259 XFS_STATS_INC(xs_write_calls);
260 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
261 return -EIO;
262
263 xfs_ilock(ip, XFS_IOLOCK_EXCL);
264
265 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
266 int iolock = XFS_IOLOCK_EXCL;
267 int error;
268
269 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
270 FILP_DELAY_FLAG(outfilp), &iolock);
271 if (error) {
272 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
273 return -error;
274 }
275 }
276
277 new_size = *ppos + count;
278
279 xfs_ilock(ip, XFS_ILOCK_EXCL);
280 if (new_size > ip->i_size)
281 ip->i_new_size = new_size;
282 xfs_iunlock(ip, XFS_ILOCK_EXCL);
283
284 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
285
286 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
287 if (ret > 0)
288 XFS_STATS_ADD(xs_write_bytes, ret);
289
290 isize = i_size_read(inode);
291 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
292 *ppos = isize;
293
294 if (*ppos > ip->i_size) {
295 xfs_ilock(ip, XFS_ILOCK_EXCL);
296 if (*ppos > ip->i_size)
297 ip->i_size = *ppos;
298 xfs_iunlock(ip, XFS_ILOCK_EXCL);
299 }
300
301 if (ip->i_new_size) {
302 xfs_ilock(ip, XFS_ILOCK_EXCL);
303 ip->i_new_size = 0;
304 if (ip->i_d.di_size > ip->i_size)
305 ip->i_d.di_size = ip->i_size;
306 xfs_iunlock(ip, XFS_ILOCK_EXCL);
307 }
308 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310}
311
312/*
313 * This routine is called to handle zeroing any space in the last
314 * block of the file that is beyond the EOF. We do this since the
315 * size is being increased without writing anything to that block
316 * and we don't want anyone to read the garbage on the disk.
317 */
318STATIC int /* error (positive) */
319xfs_zero_last_block(
320 xfs_inode_t *ip,
321 xfs_fsize_t offset,
322 xfs_fsize_t isize)
323{
324 xfs_fileoff_t last_fsb;
325 xfs_mount_t *mp = ip->i_mount;
326 int nimaps;
327 int zero_offset;
328 int zero_len;
329 int error = 0;
330 xfs_bmbt_irec_t imap;
331
332 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
333
334 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
335 if (zero_offset == 0) {
336 /*
337 * There are no extra bytes in the last block on disk to
338 * zero, so return.
339 */
340 return 0;
341 }
342
343 last_fsb = XFS_B_TO_FSBT(mp, isize);
344 nimaps = 1;
345 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
346 &nimaps, NULL, NULL);
347 if (error) {
348 return error;
349 }
350 ASSERT(nimaps > 0);
351 /*
352 * If the block underlying isize is just a hole, then there
353 * is nothing to zero.
354 */
355 if (imap.br_startblock == HOLESTARTBLOCK) {
356 return 0;
357 }
358 /*
359 * Zero the part of the last block beyond the EOF, and write it
360 * out sync. We need to drop the ilock while we do this so we
361 * don't deadlock when the buffer cache calls back to us.
362 */
363 xfs_iunlock(ip, XFS_ILOCK_EXCL);
364
365 zero_len = mp->m_sb.sb_blocksize - zero_offset;
366 if (isize + zero_len > offset)
367 zero_len = offset - isize;
368 error = xfs_iozero(ip, isize, zero_len);
369
370 xfs_ilock(ip, XFS_ILOCK_EXCL);
371 ASSERT(error >= 0);
372 return error;
373}
374
375/*
376 * Zero any on disk space between the current EOF and the new,
377 * larger EOF. This handles the normal case of zeroing the remainder
378 * of the last block in the file and the unusual case of zeroing blocks
379 * out beyond the size of the file. This second case only happens
380 * with fixed size extents and when the system crashes before the inode
381 * size was updated but after blocks were allocated. If fill is set,
382 * then any holes in the range are filled and zeroed. If not, the holes
383 * are left alone as holes.
384 */
385
386int /* error (positive) */
387xfs_zero_eof(
388 xfs_inode_t *ip,
389 xfs_off_t offset, /* starting I/O offset */
390 xfs_fsize_t isize) /* current inode size */
391{
392 xfs_mount_t *mp = ip->i_mount;
393 xfs_fileoff_t start_zero_fsb;
394 xfs_fileoff_t end_zero_fsb;
395 xfs_fileoff_t zero_count_fsb;
396 xfs_fileoff_t last_fsb;
397 xfs_fileoff_t zero_off;
398 xfs_fsize_t zero_len;
399 int nimaps;
400 int error = 0;
401 xfs_bmbt_irec_t imap;
402
403 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
404 ASSERT(offset > isize);
405
406 /*
407 * First handle zeroing the block on which isize resides.
408 * We only zero a part of that block so it is handled specially.
409 */
410 error = xfs_zero_last_block(ip, offset, isize);
411 if (error) {
412 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
413 return error;
414 }
415
416 /*
417 * Calculate the range between the new size and the old
418 * where blocks needing to be zeroed may exist. To get the
419 * block where the last byte in the file currently resides,
420 * we need to subtract one from the size and truncate back
421 * to a block boundary. We subtract 1 in case the size is
422 * exactly on a block boundary.
423 */
424 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
425 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
426 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
427 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
428 if (last_fsb == end_zero_fsb) {
429 /*
430 * The size was only incremented on its last block.
431 * We took care of that above, so just return.
432 */
433 return 0;
434 }
435
436 ASSERT(start_zero_fsb <= end_zero_fsb);
437 while (start_zero_fsb <= end_zero_fsb) {
438 nimaps = 1;
439 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
440 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
441 0, NULL, 0, &imap, &nimaps, NULL, NULL);
442 if (error) {
443 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
444 return error;
445 }
446 ASSERT(nimaps > 0);
447
448 if (imap.br_state == XFS_EXT_UNWRITTEN ||
449 imap.br_startblock == HOLESTARTBLOCK) {
450 /*
451 * This loop handles initializing pages that were
452 * partially initialized by the code below this
453 * loop. It basically zeroes the part of the page
454 * that sits on a hole and sets the page as P_HOLE
455 * and calls remapf if it is a mapped file.
456 */
457 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
458 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
459 continue;
460 }
461
462 /*
463 * There are blocks we need to zero.
464 * Drop the inode lock while we're doing the I/O.
465 * We'll still have the iolock to protect us.
466 */
467 xfs_iunlock(ip, XFS_ILOCK_EXCL);
468
469 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
470 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
471
472 if ((zero_off + zero_len) > offset)
473 zero_len = offset - zero_off;
474
475 error = xfs_iozero(ip, zero_off, zero_len);
476 if (error) {
477 goto out_lock;
478 }
479
480 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
481 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
482
483 xfs_ilock(ip, XFS_ILOCK_EXCL);
484 }
485
486 return 0;
487
488out_lock:
489 xfs_ilock(ip, XFS_ILOCK_EXCL);
490 ASSERT(error >= 0);
491 return error;
492}
493
494ssize_t /* bytes written, or (-) error */
495xfs_write(
496 struct xfs_inode *xip,
497 struct kiocb *iocb,
498 const struct iovec *iovp,
499 unsigned int nsegs,
500 loff_t *offset,
501 int ioflags)
502{
503 struct file *file = iocb->ki_filp;
504 struct address_space *mapping = file->f_mapping;
505 struct inode *inode = mapping->host;
506 unsigned long segs = nsegs;
507 xfs_mount_t *mp;
508 ssize_t ret = 0, error = 0;
509 xfs_fsize_t isize, new_size;
510 int iolock;
511 int eventsent = 0;
512 size_t ocount = 0, count;
513 loff_t pos;
514 int need_i_mutex;
515
516 XFS_STATS_INC(xs_write_calls);
517
518 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
519 if (error)
520 return error;
521
522 count = ocount;
523 pos = *offset;
524
525 if (count == 0)
526 return 0;
527
528 mp = xip->i_mount;
529
530 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
531
532 if (XFS_FORCED_SHUTDOWN(mp))
533 return -EIO;
534
535relock:
536 if (ioflags & IO_ISDIRECT) {
537 iolock = XFS_IOLOCK_SHARED;
538 need_i_mutex = 0;
539 } else {
540 iolock = XFS_IOLOCK_EXCL;
541 need_i_mutex = 1;
542 mutex_lock(&inode->i_mutex);
543 }
544
545 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
546
547start:
548 error = -generic_write_checks(file, &pos, &count,
549 S_ISBLK(inode->i_mode));
550 if (error) {
551 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
552 goto out_unlock_mutex;
553 }
554
555 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
556 !(ioflags & IO_INVIS) && !eventsent)) {
557 int dmflags = FILP_DELAY_FLAG(file);
558
559 if (need_i_mutex)
560 dmflags |= DM_FLAGS_IMUX;
561
562 xfs_iunlock(xip, XFS_ILOCK_EXCL);
563 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
564 pos, count, dmflags, &iolock);
565 if (error) {
566 goto out_unlock_internal;
567 }
568 xfs_ilock(xip, XFS_ILOCK_EXCL);
569 eventsent = 1;
570
571 /*
572 * The iolock was dropped and reacquired in XFS_SEND_DATA
573 * so we have to recheck the size when appending.
574 * We will only "goto start;" once, since having sent the
575 * event prevents another call to XFS_SEND_DATA, which is
576 * what allows the size to change in the first place.
577 */
578 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
579 goto start;
580 }
581
582 if (ioflags & IO_ISDIRECT) {
583 xfs_buftarg_t *target =
584 XFS_IS_REALTIME_INODE(xip) ?
585 mp->m_rtdev_targp : mp->m_ddev_targp;
586
587 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
588 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
589 return XFS_ERROR(-EINVAL);
590 }
591
592 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
593 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
594 iolock = XFS_IOLOCK_EXCL;
595 need_i_mutex = 1;
596 mutex_lock(&inode->i_mutex);
597 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
598 goto start;
599 }
600 }
601
602 new_size = pos + count;
603 if (new_size > xip->i_size)
604 xip->i_new_size = new_size;
605
606 if (likely(!(ioflags & IO_INVIS)))
607 file_update_time(file);
608
609 /*
610 * If the offset is beyond the size of the file, we have a couple
611 * of things to do. First, if there is already space allocated
612 * we need to either create holes or zero the disk or ...
613 *
614 * If there is a page where the previous size lands, we need
615 * to zero it out up to the new size.
616 */
617
618 if (pos > xip->i_size) {
619 error = xfs_zero_eof(xip, pos, xip->i_size);
620 if (error) {
621 xfs_iunlock(xip, XFS_ILOCK_EXCL);
622 goto out_unlock_internal;
623 }
624 }
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626
627 /*
628 * If we're writing the file then make sure to clear the
629 * setuid and setgid bits if the process is not being run
630 * by root. This keeps people from modifying setuid and
631 * setgid binaries.
632 */
633
634 if (((xip->i_d.di_mode & S_ISUID) ||
635 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
636 (S_ISGID | S_IXGRP))) &&
637 !capable(CAP_FSETID)) {
638 error = xfs_write_clear_setuid(xip);
639 if (likely(!error))
640 error = -file_remove_suid(file);
641 if (unlikely(error)) {
642 goto out_unlock_internal;
643 }
644 }
645
646 /* We can write back this queue in page reclaim */
647 current->backing_dev_info = mapping->backing_dev_info;
648
649 if ((ioflags & IO_ISDIRECT)) {
650 if (mapping->nrpages) {
651 WARN_ON(need_i_mutex == 0);
652 error = xfs_flushinval_pages(xip,
653 (pos & PAGE_CACHE_MASK),
654 -1, FI_REMAPF_LOCKED);
655 if (error)
656 goto out_unlock_internal;
657 }
658
659 if (need_i_mutex) {
660 /* demote the lock now the cached pages are gone */
661 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
662 mutex_unlock(&inode->i_mutex);
663
664 iolock = XFS_IOLOCK_SHARED;
665 need_i_mutex = 0;
666 }
667
668 trace_xfs_file_direct_write(xip, count, *offset, ioflags);
669 ret = generic_file_direct_write(iocb, iovp,
670 &segs, pos, offset, count, ocount);
671
672 /*
673 * direct-io write to a hole: fall through to buffered I/O
674 * for completing the rest of the request.
675 */
676 if (ret >= 0 && ret != count) {
677 XFS_STATS_ADD(xs_write_bytes, ret);
678
679 pos += ret;
680 count -= ret;
681
682 ioflags &= ~IO_ISDIRECT;
683 xfs_iunlock(xip, iolock);
684 goto relock;
685 }
686 } else {
687 int enospc = 0;
688 ssize_t ret2 = 0;
689
690write_retry:
691 trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
692 ret2 = generic_file_buffered_write(iocb, iovp, segs,
693 pos, offset, count, ret);
694 /*
695 * if we just got an ENOSPC, flush the inode now we
696 * aren't holding any page locks and retry *once*
697 */
698 if (ret2 == -ENOSPC && !enospc) {
699 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
700 if (error)
701 goto out_unlock_internal;
702 enospc = 1;
703 goto write_retry;
704 }
705 ret = ret2;
706 }
707
708 current->backing_dev_info = NULL;
709
710 isize = i_size_read(inode);
711 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
712 *offset = isize;
713
714 if (*offset > xip->i_size) {
715 xfs_ilock(xip, XFS_ILOCK_EXCL);
716 if (*offset > xip->i_size)
717 xip->i_size = *offset;
718 xfs_iunlock(xip, XFS_ILOCK_EXCL);
719 }
720
721 if (ret == -ENOSPC &&
722 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
723 xfs_iunlock(xip, iolock);
724 if (need_i_mutex)
725 mutex_unlock(&inode->i_mutex);
726 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
727 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
728 0, 0, 0); /* Delay flag intentionally unused */
729 if (need_i_mutex)
730 mutex_lock(&inode->i_mutex);
731 xfs_ilock(xip, iolock);
732 if (error)
733 goto out_unlock_internal;
734 goto start;
735 }
736
737 error = -ret;
738 if (ret <= 0)
739 goto out_unlock_internal;
740
741 XFS_STATS_ADD(xs_write_bytes, ret);
742
743 /* Handle various SYNC-type writes */
744 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
745 loff_t end = pos + ret - 1;
746 int error2;
747
748 xfs_iunlock(xip, iolock);
749 if (need_i_mutex)
750 mutex_unlock(&inode->i_mutex);
751
752 error2 = filemap_write_and_wait_range(mapping, pos, end);
753 if (!error)
754 error = error2;
755 if (need_i_mutex)
756 mutex_lock(&inode->i_mutex);
757 xfs_ilock(xip, iolock);
758
759 error2 = xfs_fsync(xip);
760 if (!error)
761 error = error2;
762 }
763
764 out_unlock_internal:
765 if (xip->i_new_size) {
766 xfs_ilock(xip, XFS_ILOCK_EXCL);
767 xip->i_new_size = 0;
768 /*
769 * If this was a direct or synchronous I/O that failed (such
770 * as ENOSPC) then part of the I/O may have been written to
771 * disk before the error occurred. In this case the on-disk
772 * file size may have been adjusted beyond the in-memory file
773 * size and now needs to be truncated back.
774 */
775 if (xip->i_d.di_size > xip->i_size)
776 xip->i_d.di_size = xip->i_size;
777 xfs_iunlock(xip, XFS_ILOCK_EXCL);
778 }
779 xfs_iunlock(xip, iolock);
780 out_unlock_mutex:
781 if (need_i_mutex)
782 mutex_unlock(&inode->i_mutex);
783 return -error;
784}
785
786/*
787 * All xfs metadata buffers except log state machine buffers
788 * get this attached as their b_bdstrat callback function.
789 * This is so that we can catch a buffer
790 * after prematurely unpinning it to forcibly shutdown the filesystem.
791 */
792int
793xfs_bdstrat_cb(struct xfs_buf *bp)
794{
795 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
796 trace_xfs_bdstrat_shut(bp, _RET_IP_);
797 /*
798 * Metadata write that didn't get logged but
799 * written delayed anyway. These aren't associated
800 * with a transaction, and can be ignored.
801 */
802 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
803 (XFS_BUF_ISREAD(bp)) == 0)
804 return (xfs_bioerror_relse(bp));
805 else
806 return (xfs_bioerror(bp));
807 }
808
809 xfs_buf_iorequest(bp);
810 return 0;
811}
812
813/*
814 * Wrapper around bdstrat so that we can stop data from going to disk in case
815 * we are shutting down the filesystem. Typically user data goes thru this
816 * path; one of the exceptions is the superblock.
817 */
818void
819xfsbdstrat(
820 struct xfs_mount *mp,
821 struct xfs_buf *bp)
822{
823 ASSERT(mp);
824 if (!XFS_FORCED_SHUTDOWN(mp)) {
825 xfs_buf_iorequest(bp);
826 return;
827 }
828
829 trace_xfs_bdstrat_shut(bp, _RET_IP_);
830 xfs_bioerror_relse(bp);
831}
832
833/*
834 * If the underlying (data/log/rt) device is readonly, there are some
835 * operations that cannot proceed.
836 */
837int
838xfs_dev_is_read_only(
839 xfs_mount_t *mp,
840 char *message)
841{
842 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
843 xfs_readonly_buftarg(mp->m_logdev_targp) ||
844 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
845 cmn_err(CE_NOTE,
846 "XFS: %s required on read-only device.", message);
847 cmn_err(CE_NOTE,
848 "XFS: write access unavailable, cannot proceed.");
849 return EROFS;
850 }
851 return 0;
852}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index d1f7789c7ffb..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,32 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_buf;
24
25/* errors from xfsbdstrat() must be extracted from the buffer */
26extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
27extern int xfs_bdstrat_cb(struct xfs_buf *);
28extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
29
30extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
31
32#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..1947514ce1ad 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
44} 44}
45 45
46STATIC int 46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
55 if (!XFS_IS_QUOTA_RUNNING(mp))
56 return -ENOSYS;
57 return -xfs_sync_data(mp, 0);
58}
59
60STATIC int
61xfs_fs_get_xstate( 47xfs_fs_get_xstate(
62 struct super_block *sb, 48 struct super_block *sb,
63 struct fs_quota_stat *fqs) 49 struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
82 return -EROFS; 68 return -EROFS;
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 70 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM;
87 71
88 if (uflags & XFS_QUOTA_UDQ_ACCT) 72 if (uflags & XFS_QUOTA_UDQ_ACCT)
89 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
144 return -ENOSYS; 128 return -ENOSYS;
145 if (!XFS_IS_QUOTA_ON(mp)) 129 if (!XFS_IS_QUOTA_ON(mp))
146 return -ESRCH; 130 return -ESRCH;
147 if (!capable(CAP_SYS_ADMIN))
148 return -EPERM;
149 131
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 132 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 133}
152 134
153const struct quotactl_ops xfs_quotactl_operations = { 135const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 136 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 137 .set_xstate = xfs_fs_set_xstate,
157 .get_xquota = xfs_fs_get_xquota, 138 .get_xquota = xfs_fs_get_xquota,
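
With the filesystem-private quota_sync method and the CAP_SYS_ADMIN checks removed here, the remaining methods back the Q_XGETQSTAT/Q_XSETQLIM family of quotactl commands. A hedged userspace sketch of querying quota state follows; the /dev/sdb1 device path is an assumption, and root privileges are typically required.

/*
 * Sketch: Q_XGETQSTAT lands in xfs_fs_get_xstate() above and reports
 * whether quota accounting and enforcement are active on the filesystem.
 */
#include <stdio.h>
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>

int main(void)
{
	struct fs_quota_stat qstat = { 0 };

	if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), "/dev/sdb1", 0,
		     (void *)&qstat) != 0) {
		perror("quotactl");
		return 1;
	}

	printf("version %d, flags 0x%x\n", qstat.qs_version, qstat.qs_flags);
	return 0;
}
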
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..71345a370d9f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -877,12 +877,11 @@ xfsaild(
877{ 877{
878 struct xfs_ail *ailp = data; 878 struct xfs_ail *ailp = data;
879 xfs_lsn_t last_pushed_lsn = 0; 879 xfs_lsn_t last_pushed_lsn = 0;
880 long tout = 0; 880 long tout = 0; /* milliseconds */
881 881
882 while (!kthread_should_stop()) { 882 while (!kthread_should_stop()) {
883 if (tout) 883 schedule_timeout_interruptible(tout ?
884 schedule_timeout_interruptible(msecs_to_jiffies(tout)); 884 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
885 tout = 1000;
886 885
887 /* swsusp */ 886 /* swsusp */
888 try_to_freeze(); 887 try_to_freeze();
@@ -1022,59 +1021,108 @@ xfs_fs_dirty_inode(
1022 XFS_I(inode)->i_update_core = 1; 1021 XFS_I(inode)->i_update_core = 1;
1023} 1022}
1024 1023
1025/* 1024STATIC int
1026 * Attempt to flush the inode, this will actually fail 1025xfs_log_inode(
1027 * if the inode is pinned, but we dirty the inode again 1026 struct xfs_inode *ip)
1028 * at the point when it is unpinned after a log write, 1027{
1029 * since this is when the inode itself becomes flushable. 1028 struct xfs_mount *mp = ip->i_mount;
1030 */ 1029 struct xfs_trans *tp;
1030 int error;
1031
1032 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1033 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1034 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
1035
1036 if (error) {
1037 xfs_trans_cancel(tp, 0);
1038 /* we need to return with the lock held shared */
1039 xfs_ilock(ip, XFS_ILOCK_SHARED);
1040 return error;
1041 }
1042
1043 xfs_ilock(ip, XFS_ILOCK_EXCL);
1044
1045 /*
1046 * Note - it's possible that we might have pushed ourselves out of the
1047 * way during trans_reserve which would flush the inode. But there's
1048 * no guarantee that the inode buffer has actually gone out yet (it's
1049 * delwri). Plus the buffer could be pinned anyway if it's part of
1050 * an inode in another recent transaction. So we play it safe and
1051 * fire off the transaction anyway.
1052 */
1053 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1054 xfs_trans_ihold(tp, ip);
1055 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1056 xfs_trans_set_sync(tp);
1057 error = xfs_trans_commit(tp, 0);
1058 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1059
1060 return error;
1061}
1062
1031STATIC int 1063STATIC int
1032xfs_fs_write_inode( 1064xfs_fs_write_inode(
1033 struct inode *inode, 1065 struct inode *inode,
1034 int sync) 1066 struct writeback_control *wbc)
1035{ 1067{
1036 struct xfs_inode *ip = XFS_I(inode); 1068 struct xfs_inode *ip = XFS_I(inode);
1037 struct xfs_mount *mp = ip->i_mount; 1069 struct xfs_mount *mp = ip->i_mount;
1038 int error = 0; 1070 int error = EAGAIN;
1039 1071
1040 xfs_itrace_entry(ip); 1072 xfs_itrace_entry(ip);
1041 1073
1042 if (XFS_FORCED_SHUTDOWN(mp)) 1074 if (XFS_FORCED_SHUTDOWN(mp))
1043 return XFS_ERROR(EIO); 1075 return XFS_ERROR(EIO);
1044 1076
1045 if (sync) { 1077 if (wbc->sync_mode == WB_SYNC_ALL) {
1046 error = xfs_wait_on_pages(ip, 0, -1); 1078 /*
1047 if (error) 1079 * Make sure the inode has hit stable storage. By using the
1080 * log and the fsync transactions we reduce the IOs we have
1081 * to do here from two (log and inode) to just the log.
1082 *
1083 * Note: We still need to do a delwri write of the inode after
1084 * this to flush it to the backing buffer so that bulkstat
1085 * works properly if this is the first time the inode has been
1086 * written. Because we hold the ilock atomically over the
1087 * transaction commit and the inode flush we are guaranteed
1088 * that the inode is not pinned when it returns. If the flush
1089 * lock is already held, then the inode has already been
1090 * flushed once and we don't need to flush it again. Hence
1091 * the code will only flush the inode if it isn't already
1092 * being flushed.
1093 */
1094 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip);
1097 if (error)
1098 goto out_unlock;
1099 }
1100 } else {
1101 /*
1102 * We make this non-blocking if the inode is contended, return
1103 * EAGAIN to indicate to the caller that they did not succeed.
1104 * This prevents the flush path from blocking on inodes inside
1105 * another operation right now, they get caught later by xfs_sync.
1106 */
1107 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1048 goto out; 1108 goto out;
1049 } 1109 }
1050 1110
1051 /* 1111 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1052 * Bypass inodes which have already been cleaned by 1112 goto out_unlock;
1053 * the inode flush clustering code inside xfs_iflush
1054 */
1055 if (xfs_inode_clean(ip))
1056 goto out;
1057 1113
1058 /* 1114 /*
1059 * We make this non-blocking if the inode is contended, return 1115 * Now we have the flush lock and the inode is not pinned, we can check
1060 * EAGAIN to indicate to the caller that they did not succeed. 1116 * if the inode is really clean as we know that there are no pending
1061 * This prevents the flush path from blocking on inodes inside 1117 * transaction completions, it is not waiting on the delayed write
1062 * another operation right now, they get caught later by xfs_sync. 1118 * queue and there is no IO in progress.
1063 */ 1119 */
1064 if (sync) { 1120 if (xfs_inode_clean(ip)) {
1065 xfs_ilock(ip, XFS_ILOCK_SHARED); 1121 xfs_ifunlock(ip);
1066 xfs_iflock(ip); 1122 error = 0;
1067 1123 goto out_unlock;
1068 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1069 } else {
1070 error = EAGAIN;
1071 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1072 goto out;
1073 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1074 goto out_unlock;
1075
1076 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1077 } 1124 }
1125 error = xfs_iflush(ip, 0);
1078 1126
1079 out_unlock: 1127 out_unlock:
1080 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1128 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1257,6 +1305,29 @@ xfs_fs_statfs(
1257 return 0; 1305 return 0;
1258} 1306}
1259 1307
1308STATIC void
1309xfs_save_resvblks(struct xfs_mount *mp)
1310{
1311 __uint64_t resblks = 0;
1312
1313 mp->m_resblks_save = mp->m_resblks;
1314 xfs_reserve_blocks(mp, &resblks, NULL);
1315}
1316
1317STATIC void
1318xfs_restore_resvblks(struct xfs_mount *mp)
1319{
1320 __uint64_t resblks;
1321
1322 if (mp->m_resblks_save) {
1323 resblks = mp->m_resblks_save;
1324 mp->m_resblks_save = 0;
1325 } else
1326 resblks = xfs_default_resblks(mp);
1327
1328 xfs_reserve_blocks(mp, &resblks, NULL);
1329}
1330
1260STATIC int 1331STATIC int
1261xfs_fs_remount( 1332xfs_fs_remount(
1262 struct super_block *sb, 1333 struct super_block *sb,
@@ -1336,11 +1407,27 @@ xfs_fs_remount(
1336 } 1407 }
1337 mp->m_update_flags = 0; 1408 mp->m_update_flags = 0;
1338 } 1409 }
1410
1411 /*
1412 * Fill out the reserve pool if it is empty. Use the stashed
1413 * value if it is non-zero, otherwise go with the default.
1414 */
1415 xfs_restore_resvblks(mp);
1339 } 1416 }
1340 1417
1341 /* rw -> ro */ 1418 /* rw -> ro */
1342 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1419 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1420 /*
1421 * After we have synced the data but before we sync the
1422 * metadata, we need to free up the reserve block pool so that
1423 * the used block count in the superblock on disk is correct at
1424 * the end of the remount. Stash the current reserve pool size
1425 * so that if we get remounted rw, we can return it to the same
1426 * size.
1427 */
1428
1343 xfs_quiesce_data(mp); 1429 xfs_quiesce_data(mp);
1430 xfs_save_resvblks(mp);
1344 xfs_quiesce_attr(mp); 1431 xfs_quiesce_attr(mp);
1345 mp->m_flags |= XFS_MOUNT_RDONLY; 1432 mp->m_flags |= XFS_MOUNT_RDONLY;
1346 } 1433 }
@@ -1359,11 +1446,22 @@ xfs_fs_freeze(
1359{ 1446{
1360 struct xfs_mount *mp = XFS_M(sb); 1447 struct xfs_mount *mp = XFS_M(sb);
1361 1448
1449 xfs_save_resvblks(mp);
1362 xfs_quiesce_attr(mp); 1450 xfs_quiesce_attr(mp);
1363 return -xfs_fs_log_dummy(mp); 1451 return -xfs_fs_log_dummy(mp);
1364} 1452}
1365 1453
1366STATIC int 1454STATIC int
1455xfs_fs_unfreeze(
1456 struct super_block *sb)
1457{
1458 struct xfs_mount *mp = XFS_M(sb);
1459
1460 xfs_restore_resvblks(mp);
1461 return 0;
1462}
1463
1464STATIC int
1367xfs_fs_show_options( 1465xfs_fs_show_options(
1368 struct seq_file *m, 1466 struct seq_file *m,
1369 struct vfsmount *mnt) 1467 struct vfsmount *mnt)
@@ -1585,6 +1683,7 @@ static const struct super_operations xfs_super_operations = {
1585 .put_super = xfs_fs_put_super, 1683 .put_super = xfs_fs_put_super,
1586 .sync_fs = xfs_fs_sync_fs, 1684 .sync_fs = xfs_fs_sync_fs,
1587 .freeze_fs = xfs_fs_freeze, 1685 .freeze_fs = xfs_fs_freeze,
1686 .unfreeze_fs = xfs_fs_unfreeze,
1588 .statfs = xfs_fs_statfs, 1687 .statfs = xfs_fs_statfs,
1589 .remount_fs = xfs_fs_remount, 1688 .remount_fs = xfs_fs_remount,
1590 .show_options = xfs_fs_show_options, 1689 .show_options = xfs_fs_show_options,
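The reserve-pool changes above pair up symmetrically: any transition that must leave exact free-space counters on disk (freeze, rw -> ro remount) empties the pool with xfs_save_resvblks(), and the reverse transition (unfreeze, ro -> rw remount) refills it with xfs_restore_resvblks(). A minimal sketch of that pairing, using only the helpers introduced above; xfs_reserve_blocks() itself lives outside this diffstat, so its exact semantics are assumed here:

	/* Illustrative pairing only; the real call sites are in the hunks above. */

	/* going read-only or freezing: remember the pool size, then drain it */
	xfs_save_resvblks(mp);		/* m_resblks_save = m_resblks; reserve 0 blocks */

	/* on-disk superblock now shows the true free/used block counts */

	/* going read-write or thawing: refill from the stash, or the default */
	xfs_restore_resvblks(mp);	/* reserve m_resblks_save, else xfs_default_resblks(mp) */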
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..05cd85317f6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup(
90STATIC int 90STATIC int
91xfs_inode_ag_walk( 91xfs_inode_ag_walk(
92 struct xfs_mount *mp, 92 struct xfs_mount *mp,
93 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
94 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive)
99{ 99{
100 struct xfs_perag *pag = &mp->m_perag[ag];
101 uint32_t first_index; 100 uint32_t first_index;
102 int last_error = 0; 101 int last_error = 0;
103 int skipped; 102 int skipped;
@@ -141,8 +140,6 @@ restart:
141 delay(1); 140 delay(1);
142 goto restart; 141 goto restart;
143 } 142 }
144
145 xfs_put_perag(mp, pag);
146 return last_error; 143 return last_error;
147} 144}
148 145
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator(
160 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
161 158
162 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
163 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
164 continue; 165 continue;
165 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
166 exclusive); 168 exclusive);
169 xfs_perag_put(pag);
167 if (error) { 170 if (error) {
168 last_error = error; 171 last_error = error;
169 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -231,7 +234,7 @@ xfs_sync_inode_data(
231 } 234 }
232 235
233 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
234 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
235 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
236 239
237 out_wait: 240 out_wait:
@@ -267,8 +270,7 @@ xfs_sync_inode_attr(
267 goto out_unlock; 270 goto out_unlock;
268 } 271 }
269 272
270 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
271 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
272 274
273 out_unlock: 275 out_unlock:
274 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -293,10 +295,7 @@ xfs_sync_data(
293 if (error) 295 if (error)
294 return XFS_ERROR(error); 296 return XFS_ERROR(error);
295 297
296 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
297 (flags & SYNC_WAIT) ?
298 XFS_LOG_FORCE | XFS_LOG_SYNC :
299 XFS_LOG_FORCE);
300 return 0; 299 return 0;
301} 300}
302 301
@@ -322,10 +321,6 @@ xfs_commit_dummy_trans(
322 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
323 struct xfs_trans *tp; 322 struct xfs_trans *tp;
324 int error; 323 int error;
325 int log_flags = XFS_LOG_FORCE;
326
327 if (flags & SYNC_WAIT)
328 log_flags |= XFS_LOG_SYNC;
329 324
330 /* 325 /*
331 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -347,11 +342,11 @@ xfs_commit_dummy_trans(
347 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
348 343
349 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
350 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
351 return error; 346 return error;
352} 347}
353 348
354int 349STATIC int
355xfs_sync_fsdata( 350xfs_sync_fsdata(
356 struct xfs_mount *mp, 351 struct xfs_mount *mp,
357 int flags) 352 int flags)
@@ -367,7 +362,7 @@ xfs_sync_fsdata(
367 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
368 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
369 364
370 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
371 if (!bp) 366 if (!bp)
372 goto out; 367 goto out;
373 368
@@ -387,7 +382,7 @@ xfs_sync_fsdata(
387 * become pinned in between there and here. 382 * become pinned in between there and here.
388 */ 383 */
389 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
390 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
391 } 386 }
392 387
393 388
@@ -448,9 +443,6 @@ xfs_quiesce_data(
448 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
449 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
450 445
451 /* drop inode references pinned by filestreams */
452 xfs_filestream_flush(mp);
453
454 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
456 448
@@ -467,16 +459,18 @@ xfs_quiesce_fs(
467{ 459{
468 int count = 0, pincount; 460 int count = 0, pincount;
469 461
462 xfs_reclaim_inodes(mp, 0);
470 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
471 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
472 464
473 /* 465 /*
474 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
475 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
476 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
477 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also do sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
478 */ 471 */
479 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
480 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
481 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
482 if (!pincount) { 476 if (!pincount) {
@@ -575,7 +569,7 @@ xfs_flush_inodes(
575 igrab(inode); 569 igrab(inode);
576 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
577 wait_for_completion(&completion); 571 wait_for_completion(&completion);
578 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
579} 573}
580 574
581/* 575/*
@@ -591,8 +585,8 @@ xfs_sync_worker(
591 int error; 585 int error;
592 586
593 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
594 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
595 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
596 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
597 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
598 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -613,7 +607,8 @@ xfssyncd(
613 set_freezable(); 607 set_freezable();
614 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
615 for (;;) { 609 for (;;) {
616 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
617 /* swsusp */ 612 /* swsusp */
618 try_to_freeze(); 613 try_to_freeze();
619 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -633,8 +628,7 @@ xfssyncd(
633 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
634 &mp->m_sync_list); 629 &mp->m_sync_list);
635 } 630 }
636 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
637 list_move(&work->w_list, &tmp);
638 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
639 633
640 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
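The xfssyncd change above also replaces a per-entry list_move() loop with a single list_splice_init(). That is standard <linux/list.h> behaviour rather than anything added by this patch: the call transplants every queued work item onto the local list in O(1) and re-initialises m_sync_list, so the queue is drained under one lock hold, roughly:

	LIST_HEAD(tmp);

	spin_lock(&mp->m_sync_lock);
	/* move the whole queue at once; m_sync_list is left empty */
	list_splice_init(&mp->m_sync_list, &tmp);
	spin_unlock(&mp->m_sync_lock);
	/* queued work items are then processed from tmp outside the lock */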
@@ -690,16 +684,17 @@ void
690xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
691 xfs_inode_t *ip) 685 xfs_inode_t *ip)
692{ 686{
693 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
694 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
695 689
696 read_lock(&pag->pag_ici_lock); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 write_lock(&pag->pag_ici_lock);
697 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
698 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
699 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
700 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
701 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
702 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
703} 698}
704 699
705void 700void
@@ -712,12 +707,64 @@ __xfs_inode_clear_reclaim_tag(
712 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
713} 708}
714 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
748 * background reclaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
715STATIC int 760STATIC int
716xfs_reclaim_inode( 761xfs_reclaim_inode(
717 struct xfs_inode *ip, 762 struct xfs_inode *ip,
718 struct xfs_perag *pag, 763 struct xfs_perag *pag,
719 int sync_mode) 764 int sync_mode)
720{ 765{
766 int error = 0;
767
721 /* 768 /*
722 * The radix tree lock here protects a thread in xfs_iget from racing 769 * The radix tree lock here protects a thread in xfs_iget from racing
723 * with us starting reclaim on the inode. Once we have the 770 * with us starting reclaim on the inode. Once we have the
@@ -735,33 +782,70 @@ xfs_reclaim_inode(
735 spin_unlock(&ip->i_flags_lock); 782 spin_unlock(&ip->i_flags_lock);
736 write_unlock(&pag->pag_ici_lock); 783 write_unlock(&pag->pag_ici_lock);
737 784
738 /*
739 * If the inode is still dirty, then flush it out. If the inode
740 * is not in the AIL, then it will be OK to flush it delwri as
741 * long as xfs_iflush() does not keep any references to the inode.
742 * We leave that decision up to xfs_iflush() since it has the
743 * knowledge of whether it's OK to simply do a delwri flush of
744 * the inode or whether we need to wait until the inode is
745 * pulled from the AIL.
746 * We get the flush lock regardless, though, just to make sure
747 * we don't free it while it is being flushed.
748 */
749 xfs_ilock(ip, XFS_ILOCK_EXCL); 785 xfs_ilock(ip, XFS_ILOCK_EXCL);
750 xfs_iflock(ip); 786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
751 816
752 /* 817 /*
753 * In the case of a forced shutdown we rely on xfs_iflush() to 818 * When we have to flush an inode but don't have SYNC_WAIT set, we
754 * wait for the inode to be unpinned before returning an error. 819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
755 */ 825 */
756 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { 826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
757 /* synchronize with xfs_iflush_done */ 827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
758 xfs_iflock(ip); 828 "inode 0x%llx background reclaim flush failed with %d",
759 xfs_ifunlock(ip); 829 (long long)ip->i_ino, error);
760 } 830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
761 842
843reclaim:
844 xfs_ifunlock(ip);
762 xfs_iunlock(ip, XFS_ILOCK_EXCL); 845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
763 xfs_ireclaim(ip); 846 xfs_ireclaim(ip);
764 return 0; 847 return error;
848
765} 849}
766 850
767int 851int
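The conversions above (xfs_inode_ag_walk(), xfs_inode_ag_iterator(), xfs_inode_set_reclaim_tag()) all switch from open-coded per-AG access to the reference-counted xfs_perag_get()/xfs_perag_put() helpers. Those helpers are defined outside this directory, so the sketch below is only an approximation of what the converted callers rely on; the lookup structure and field names (m_perag_tree, m_perag_lock, pag_ref, pag_mount, pag_agno) are assumptions, while the trace hooks match the events added to xfs_trace.h further down:

struct xfs_perag *
xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
{
	struct xfs_perag	*pag;
	int			ref = 0;

	spin_lock(&mp->m_perag_lock);
	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
	if (pag) {
		ASSERT(atomic_read(&pag->pag_ref) >= 0);
		ref = atomic_inc_return(&pag->pag_ref);
	}
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
	return pag;
}

void
xfs_perag_put(struct xfs_perag *pag)
{
	int	ref;

	ASSERT(atomic_read(&pag->pag_ref) > 0);
	ref = atomic_dec_return(&pag->pag_ref);
	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
}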
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d605..5a107601e969 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -52,22 +52,6 @@
52#include "quota/xfs_dquot.h" 52#include "quota/xfs_dquot.h"
53 53
54/* 54/*
55 * Format fsblock number into a static buffer & return it.
56 */
57STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
58{
59 static char rval[50];
60
61 if (bno == NULLFSBLOCK)
62 sprintf(rval, "NULLFSBLOCK");
63 else if (isnullstartblock(bno))
64 sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
65 else
66 sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
67 return rval;
68}
69
70/*
71 * We include this last to have the helpers above available for the trace 55 * We include this last to have the helpers above available for the trace
72 * event implementations. 56 * event implementations.
73 */ 57 */
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..fcaa62f0799e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
78 ) 78 )
79) 79)
80 80
81#define DEFINE_PERAG_REF_EVENT(name) \
82TRACE_EVENT(name, \
83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
86 TP_STRUCT__entry( \
87 __field(dev_t, dev) \
88 __field(xfs_agnumber_t, agno) \
89 __field(int, refcount) \
90 __field(unsigned long, caller_ip) \
91 ), \
92 TP_fast_assign( \
93 __entry->dev = mp->m_super->s_dev; \
94 __entry->agno = agno; \
95 __entry->refcount = refcount; \
96 __entry->caller_ip = caller_ip; \
97 ), \
98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
99 MAJOR(__entry->dev), MINOR(__entry->dev), \
100 __entry->agno, \
101 __entry->refcount, \
102 (char *)__entry->caller_ip) \
103);
104
105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
107
81#define DEFINE_ATTR_LIST_EVENT(name) \ 108#define DEFINE_ATTR_LIST_EVENT(name) \
82DEFINE_EVENT(xfs_attr_list_class, name, \ 109DEFINE_EVENT(xfs_attr_list_class, name, \
83 TP_PROTO(struct xfs_attr_list_context *ctx), \ 110 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -170,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert,
170 __entry->caller_ip = caller_ip; 197 __entry->caller_ip = caller_ip;
171 ), 198 ),
172 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
173 "offset %lld block %s count %lld flag %d caller %pf", 200 "offset %lld block %lld count %lld flag %d caller %pf",
174 MAJOR(__entry->dev), MINOR(__entry->dev), 201 MAJOR(__entry->dev), MINOR(__entry->dev),
175 __entry->ino, 202 __entry->ino,
176 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
177 (long)__entry->idx, 204 (long)__entry->idx,
178 __entry->startoff, 205 __entry->startoff,
179 xfs_fmtfsblock(__entry->startblock), 206 (__int64_t)__entry->startblock,
180 __entry->blockcount, 207 __entry->blockcount,
181 __entry->state, 208 __entry->state,
182 (char *)__entry->caller_ip) 209 (char *)__entry->caller_ip)
@@ -214,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
214 __entry->caller_ip = caller_ip; 241 __entry->caller_ip = caller_ip;
215 ), 242 ),
216 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
217 "offset %lld block %s count %lld flag %d caller %pf", 244 "offset %lld block %lld count %lld flag %d caller %pf",
218 MAJOR(__entry->dev), MINOR(__entry->dev), 245 MAJOR(__entry->dev), MINOR(__entry->dev),
219 __entry->ino, 246 __entry->ino,
220 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
221 (long)__entry->idx, 248 (long)__entry->idx,
222 __entry->startoff, 249 __entry->startoff,
223 xfs_fmtfsblock(__entry->startblock), 250 (__int64_t)__entry->startblock,
224 __entry->blockcount, 251 __entry->blockcount,
225 __entry->state, 252 __entry->state,
226 (char *)__entry->caller_ip) 253 (char *)__entry->caller_ip)
@@ -456,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
456DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
457DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
458DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
459DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
460DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
461DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -565,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
565 TP_ARGS(dqp), 593 TP_ARGS(dqp),
566 TP_STRUCT__entry( 594 TP_STRUCT__entry(
567 __field(dev_t, dev) 595 __field(dev_t, dev)
568 __field(__be32, id) 596 __field(u32, id)
569 __field(unsigned, flags) 597 __field(unsigned, flags)
570 __field(unsigned, nrefs) 598 __field(unsigned, nrefs)
571 __field(unsigned long long, res_bcount) 599 __field(unsigned long long, res_bcount)
@@ -578,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
578 ), \ 606 ), \
579 TP_fast_assign( 607 TP_fast_assign(
580 __entry->dev = dqp->q_mount->m_super->s_dev; 608 __entry->dev = dqp->q_mount->m_super->s_dev;
581 __entry->id = dqp->q_core.d_id; 609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
582 __entry->flags = dqp->dq_flags; 610 __entry->flags = dqp->dq_flags;
583 __entry->nrefs = dqp->q_nrefs; 611 __entry->nrefs = dqp->q_nrefs;
584 __entry->res_bcount = dqp->q_res_bcount; 612 __entry->res_bcount = dqp->q_res_bcount;
@@ -594,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
594 be64_to_cpu(dqp->q_core.d_ino_softlimit); 622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
595 ), 623 ),
596 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " 624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
597 "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " 625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
598 "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", 626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx",
599 MAJOR(__entry->dev), MINOR(__entry->dev), 627 MAJOR(__entry->dev), MINOR(__entry->dev),
600 be32_to_cpu(__entry->id), 628 __entry->id,
601 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), 629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
602 __entry->nrefs, 630 __entry->nrefs,
603 __entry->res_bcount, 631 __entry->res_bcount,
@@ -853,7 +881,7 @@ TRACE_EVENT(name, \
853 ), \ 881 ), \
854 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
855 "offset 0x%llx count %zd flags %s " \ 883 "offset 0x%llx count %zd flags %s " \
856 "startoff 0x%llx startblock %s blockcount 0x%llx", \ 884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
857 MAJOR(__entry->dev), MINOR(__entry->dev), \ 885 MAJOR(__entry->dev), MINOR(__entry->dev), \
858 __entry->ino, \ 886 __entry->ino, \
859 __entry->size, \ 887 __entry->size, \
@@ -862,7 +890,7 @@ TRACE_EVENT(name, \
862 __entry->count, \ 890 __entry->count, \
863 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
864 __entry->startoff, \ 892 __entry->startoff, \
865 xfs_fmtfsblock(__entry->startblock), \ 893 (__int64_t)__entry->startblock, \
866 __entry->blockcount) \ 894 __entry->blockcount) \
867) 895)
868DEFINE_IOMAP_EVENT(xfs_iomap_enter); 896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
@@ -1414,6 +1442,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
1414 __entry->count) 1442 __entry->count)
1415); 1443);
1416 1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
1417#endif /* _TRACE_XFS_H */ 1498#endif /* _TRACE_XFS_H */
1418 1499
1419#undef TRACE_INCLUDE_PATH 1500#undef TRACE_INCLUDE_PATH
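The swap-extent trace class above keys its output on a small "which" selector that __print_symbolic() maps through XFS_SWAPEXT_INODES (0 = target, 1 = temporary inode). The callers live in the extent-swap code outside this directory, so the following is only an assumed usage sketch; "tip" stands for the hypothetical temporary inode:

	/* before the fork swap: dump both inodes' extent and fork state */
	trace_xfs_swap_extent_before(ip, 0);	/* 0 -> "target" */
	trace_xfs_swap_extent_before(tip, 1);	/* 1 -> "temp" */

	/* ... swap the data fork extents between ip and tip ... */

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);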
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
45 value = NULL; 45 value = NULL;
46 } 46 }
47 47
48 error = -xfs_attr_get(ip, name, value, &asize, xflags); 48 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
49 if (error) 49 if (error)
50 return error; 50 return error;
51 return asize; 51 return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
67 xflags |= ATTR_REPLACE; 67 xflags |= ATTR_REPLACE;
68 68
69 if (!value) 69 if (!value)
70 return -xfs_attr_remove(ip, name, xflags); 70 return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
71 return -xfs_attr_set(ip, name, (void *)value, size, xflags); 71 return -xfs_attr_set(ip, (unsigned char *)name,
72 (void *)value, size, xflags);
72} 73}
73 74
74static struct xattr_handler xfs_xattr_user_handler = { 75static struct xattr_handler xfs_xattr_user_handler = {
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
124} 125}
125 126
126static int 127static int
127xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, 128xfs_xattr_put_listent(
128 char *name, int namelen, int valuelen, char *value) 129 struct xfs_attr_list_context *context,
130 int flags,
131 unsigned char *name,
132 int namelen,
133 int valuelen,
134 unsigned char *value)
129{ 135{
130 unsigned int prefix_len = xfs_xattr_prefix_len(flags); 136 unsigned int prefix_len = xfs_xattr_prefix_len(flags);
131 char *offset; 137 char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
148 offset = (char *)context->alist + context->count; 154 offset = (char *)context->alist + context->count;
149 strncpy(offset, xfs_xattr_prefix(flags), prefix_len); 155 strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
150 offset += prefix_len; 156 offset += prefix_len;
151 strncpy(offset, name, namelen); /* real name */ 157 strncpy(offset, (char *)name, namelen); /* real name */
152 offset += namelen; 158 offset += namelen;
153 *offset = '\0'; 159 *offset = '\0';
154 context->count += prefix_len + namelen + 1; 160 context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
156} 162}
157 163
158static int 164static int
159xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, 165xfs_xattr_put_listent_sizes(
160 char *name, int namelen, int valuelen, char *value) 166 struct xfs_attr_list_context *context,
167 int flags,
168 unsigned char *name,
169 int namelen,
170 int valuelen,
171 unsigned char *value)
161{ 172{
162 context->count += xfs_xattr_prefix_len(flags) + namelen + 1; 173 context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
163 return 0; 174 return 0;