Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--	fs/xfs/linux-2.6/kmem.c		 56
-rw-r--r--	fs/xfs/linux-2.6/kmem.h		 21
-rw-r--r--	fs/xfs/linux-2.6/xfs_acl.c	 11
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.c	320
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.h	 52
-rw-r--r--	fs/xfs/linux-2.6/xfs_fs_subr.c	  2
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl.c	 21
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl.h	 12
-rw-r--r--	fs/xfs/linux-2.6/xfs_ioctl32.c	  4
-rw-r--r--	fs/xfs/linux-2.6/xfs_iops.c	  4
-rw-r--r--	fs/xfs/linux-2.6/xfs_lrw.c	 62
-rw-r--r--	fs/xfs/linux-2.6/xfs_lrw.h	  3
-rw-r--r--	fs/xfs/linux-2.6/xfs_super.c	169
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	186
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.h	  1
-rw-r--r--	fs/xfs/linux-2.6/xfs_trace.h	 81
-rw-r--r--	fs/xfs/linux-2.6/xfs_xattr.c	 27
17 files changed, 727 insertions, 305 deletions
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..bc7405585def 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/mm.h>
-#include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
@@ -24,8 +23,25 @@
 #include "time.h"
 #include "kmem.h"
 
-#define MAX_VMALLOCS	6
-#define MAX_SLAB_SIZE	0x20000
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ *
+ * Must be freed using kmem_free_large.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = kmem_zalloc_large(kmsize))) {
+		if ((kmsize >>= 1) <= minsize)
+			kmsize = minsize;
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
 
 void *
 kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
-#ifdef DEBUG
-	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
-		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-			__func__, (long)size);
-		dump_stack();
-	}
-#endif
-
 	do {
-		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
-			ptr = kmalloc(size, lflags);
-		else
-			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+		ptr = kmalloc(size, lflags);
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
@@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
 	return ptr;
 }
 
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
-		   unsigned int __nocast flags)
-{
-	void		*ptr;
-	size_t		kmsize = maxsize;
-	unsigned int	kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
-
-	while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
-		if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
-			break;
-		if ((kmsize >>= 1) <= minsize) {
-			kmsize = minsize;
-			kmflags = flags;
-		}
-	}
-	if (ptr)
-		*size = kmsize;
-	return ptr;
-}
-
 void
 kmem_free(const void *ptr)
 {
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 /*
  * General memory allocation interfaces
@@ -30,7 +31,6 @@
 #define KM_NOSLEEP	0x0002u
 #define KM_NOFS		0x0004u
 #define KM_MAYFAIL	0x0008u
-#define KM_LARGE	0x0010u
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
 
 	if (flags & KM_NOSLEEP) {
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
 
 extern void *kmem_alloc(size_t, unsigned int __nocast);
 extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
 extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
 extern void  kmem_free(const void *);
 
+static inline void *kmem_zalloc_large(size_t size)
+{
+	void *ptr;
+
+	ptr = vmalloc(size);
+	if (ptr)
+		memset(ptr, 0, size);
+	return ptr;
+}
+static inline void kmem_free_large(void *ptr)
+{
+	vfree(ptr);
+}
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
 /*
  * Zone interfaces
  */
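The replacement kmem_zalloc_greedy() always allocates with the vmalloc-backed kmem_zalloc_large() and halves the request until an allocation succeeds, never dropping below minsize; the result must be freed with kmem_free_large(). A minimal userspace sketch of that back-off loop, with calloc() standing in for kmem_zalloc_large() and hypothetical demo_* names:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for kmem_zalloc_large(); calloc() is only a model here. */
static void *demo_zalloc_large(size_t size)
{
	return calloc(1, size);
}

/*
 * Same back-off as the new kmem_zalloc_greedy(): halve the request until
 * an allocation succeeds, never below minsize; report the size obtained.
 */
static void *demo_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
{
	void *ptr;
	size_t kmsize = maxsize;

	while (!(ptr = demo_zalloc_large(kmsize))) {
		if ((kmsize >>= 1) <= minsize)
			kmsize = minsize;
	}
	*size = kmsize;
	return ptr;
}

int main(void)
{
	size_t got;
	void *p = demo_zalloc_greedy(&got, 4096, 1 << 20);

	printf("allocated %zu bytes\n", got);	/* up to 1 MiB, at least 4 KiB */
	free(p);
	return 0;
}

Note that, like the kernel loop above, the sketch keeps retrying at minsize rather than failing outright.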
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..bf85bbe4a9ae 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl;
 	struct xfs_acl *xfs_acl;
 	int len = sizeof(struct xfs_acl);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	acl = get_cached_acl(inode, type);
@@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type)
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
-	error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
+	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+							&len, ATTR_ROOT);
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +163,7 @@ STATIC int
 xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
 	struct xfs_inode *ip = XFS_I(inode);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	if (S_ISLNK(inode->i_mode))
@@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		       (sizeof(struct xfs_acl_entry) *
 		       (XFS_ACL_MAX_ENTRIES - acl->a_count));
 
-	error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
+	error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 			len, ATTR_ROOT);
 
 	kfree(xfs_acl);
@@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
 }
 
 static int
-xfs_acl_exists(struct inode *inode, char *name)
+xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
 	int len = sizeof(struct xfs_acl);
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..6f76ba85f193 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -76,6 +77,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
 #define xfs_buf_deallocate(bp) \
 	kmem_zone_free(xfs_buf_zone, (bp));
 
+static inline int
+xfs_buf_is_vmapped(
+	struct xfs_buf	*bp)
+{
+	/*
+	 * Return true if the buffer is vmapped.
+	 *
+	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+	 * code is clever enough to know it doesn't have to map a single page,
+	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+	 */
+	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+	struct xfs_buf	*bp)
+{
+	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
 /*
  * Page Region interfaces.
  *
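xfs_buf_vmap_len() is plain arithmetic: a buffer vmapped over b_page_count pages, starting b_offset bytes into its first page, spans (b_page_count * PAGE_SIZE) - b_offset bytes of virtual address space. A quick userspace check with hypothetical values (page size assumed to be 4096 here):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096	/* assumed page size for this sketch */

int main(void)
{
	int page_count = 3;	/* hypothetical bp->b_page_count */
	int offset = 512;	/* hypothetical bp->b_offset */

	/* mirrors xfs_buf_vmap_len(): span of the vmapped region */
	printf("%d\n", page_count * DEMO_PAGE_SIZE - offset);	/* 11776 */
	return 0;
}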
@@ -314,7 +336,7 @@ xfs_buf_free(
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
-		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
+		if (xfs_buf_is_vmapped(bp))
 			free_address(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
@@ -1051,22 +1073,30 @@ xfs_buf_ioerror(
 }
 
 int
-xfs_bawrite(
-	void			*mp,
+xfs_bwrite(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	trace_xfs_buf_bawrite(bp, _RET_IP_);
+	int			iowait = (bp->b_flags & XBF_ASYNC) == 0;
+	int			error = 0;
 
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	if (!iowait)
+		bp->b_flags |= _XBF_RUN_QUEUES;
 
 	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+	if (iowait) {
+		error = xfs_buf_iowait(bp);
+		if (error)
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+	}
 
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
+	return error;
 }
 
 void
@@ -1085,6 +1115,126 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XBF_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
@@ -1107,6 +1257,9 @@ xfs_buf_bio_end_io(
 
 	xfs_buf_ioerror(bp, -error);
 
+	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
 	do {
 		struct page	*page = bvec->bv_page;
 
@@ -1216,6 +1369,10 @@ next_chunk:
 
 submit_io:
 	if (likely(bio->bi_size)) {
+		if (xfs_buf_is_vmapped(bp)) {
+			flush_kernel_vmap_range(bp->b_addr,
+						xfs_buf_vmap_len(bp));
+		}
 		submit_bio(rw, bio);
 		if (size)
 			goto next_chunk;
@@ -1296,7 +1453,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
@@ -1378,8 +1535,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1547,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
@@ -1595,6 +1752,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
@@ -1626,6 +1788,35 @@ xfs_buf_delwri_dequeue(
 	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
 	struct workqueue_struct	*queue)
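xfs_buf_delwri_promote() never issues I/O itself; it only rewinds b_queuetime by age + 1 jiffies so the buffer looks old enough on the next xfsbufd scan. A userspace model of that timestamp trick (the demo_* names and the simplified ageing test are assumptions; the real test lives in xfs_buf_delwri_split()):

#include <stdio.h>

/* Hypothetical stand-ins for jiffies and the delwri flush age. */
static unsigned long demo_jiffies = 100000;
static const long demo_age = 1500 + 1;	/* "age + 1" as in the patch */

struct demo_buf {
	unsigned long queuetime;	/* models bp->b_queuetime */
};

/* Models the xfsbufd test: flush buffers queued longer than the age. */
static int demo_old_enough(const struct demo_buf *bp)
{
	return demo_jiffies - bp->queuetime > 1500;
}

int main(void)
{
	struct demo_buf bp = { .queuetime = demo_jiffies };	/* just queued */

	printf("before promote: %d\n", demo_old_enough(&bp));	/* 0 */
	bp.queuetime = demo_jiffies - demo_age;			/* promote */
	printf("after promote: %d\n", demo_old_enough(&bp));	/* 1 */
	return 0;
}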
@@ -1644,6 +1835,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1694,20 +1887,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
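The comment on xfs_buf_cmp() points at a classic pitfall: truncating a 64-bit block-number difference to the 32-bit int return value can zero or flip its sign, so the difference must be compared, not returned. A self-contained illustration with hypothetical block numbers:

#include <stdint.h>
#include <stdio.h>

/* Naive compare: truncates a 64-bit difference to 32 bits. */
static int bad_cmp(int64_t a, int64_t b)
{
	return (int)(a - b);
}

/* Safe compare, as xfs_buf_cmp() does it. */
static int good_cmp(int64_t a, int64_t b)
{
	int64_t diff = a - b;

	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

int main(void)
{
	int64_t a = 0x100000000LL;	/* blocks 2^32 apart */
	int64_t b = 0;

	printf("bad: %d, good: %d\n", bad_cmp(a, b), good_cmp(a, b));
	/* prints "bad: 0, good: 1" - truncation loses the difference */
	return 0;
}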
@@ -1715,17 +1941,16 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
-
-		count = 0;
+		xfs_buf_delwri_split(target, &tmp, age);
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
@@ -1751,42 +1976,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
 extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
-extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
+extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb(struct xfs_buf *);
+
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
 
 static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
 
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_promote(xfs_buf_t *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
 	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
 
 
-#define XFS_B_ASYNC		XBF_ASYNC
-#define XFS_B_DELWRI		XBF_DELWRI
-#define XFS_B_READ		XBF_READ
-#define XFS_B_WRITE		XBF_WRITE
-#define XFS_B_STALE		XBF_STALE
-
-#define XFS_BUF_TRYLOCK		XBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK	XBF_TRYLOCK
-#define XFS_BUF_LOCK		XBF_LOCK
-#define XFS_BUF_MAPPED		XBF_MAPPED
-
-#define BUF_BUSY		XBF_DONT_BLOCK
-
 #define XFS_BUF_BFLAGS(bp)	((bp)->b_flags)
 #define XFS_BUF_ZEROFLAGS(bp)	((bp)->b_flags &= \
 		~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
 
-#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XFS_B_STALE)
+#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XBF_STALE)
+#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp)	do {				\
 					XFS_BUF_STALE(bp);	\
 					xfs_buf_delwri_dequeue(bp);	\
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_MANAGE		XBF_FS_MANAGED
 #define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
 
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 
 #define xfs_biomove(bp, off, len, data, rw) \
 	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
+		((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
 
 #define xfs_biozero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
-
-static inline int XFS_bwrite(xfs_buf_t *bp)
-{
-	int	iowait = (bp->b_flags & XBF_ASYNC) == 0;
-	int	error = 0;
-
-	if (!iowait)
-		bp->b_flags |= _XBF_RUN_QUEUES;
-
-	xfs_buf_delwri_dequeue(bp);
-	xfs_buf_iostrategy(bp);
-	if (iowait) {
-		error = xfs_buf_iowait(bp);
-		xfs_buf_relse(bp);
-	}
-	return error;
-}
-
-#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-
 #define xfs_iowait(bp)	xfs_buf_iowait(bp)
 
 #define xfs_baread(target, rablkno, ralen)  \
@@ -424,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+
 #ifdef CONFIG_KDB_MODULES
 extern struct list_head *xfs_get_buftarg_list(void);
 #endif
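With the open-coded XFS_bwrite() removed from the header, callers go through the new xfs_bwrite() in xfs_buf.c, which keys everything off XBF_ASYNC: leave it clear and the function waits for the I/O, releases the buffer, and force-shuts-down the filesystem on error. A kernel-style sketch of a hypothetical synchronous caller (demo_write_super is not a call site from this patch):

/* Sketch only; assumes mp and bp are set up elsewhere. */
int demo_write_super(struct xfs_mount *mp, struct xfs_buf *bp)
{
	/*
	 * XBF_ASYNC clear: xfs_bwrite() does the iowait, the relse and
	 * the shutdown-on-error handling for us.
	 */
	bp->b_flags &= ~XBF_ASYNC;
	return xfs_bwrite(mp, bp);	/* 0 on success, errno otherwise */
}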
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
 		ret = -filemap_fdatawrite(mapping);
 	}
-	if (flags & XFS_B_ASYNC)
+	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
 	if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..4ea1ee18aded 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -447,12 +447,12 @@ xfs_attrlist_by_handle(
 int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (*len > XATTR_SIZE_MAX)
@@ -476,12 +476,12 @@ xfs_attrmulti_attr_get(
 int
 xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +501,7 @@ xfs_attrmulti_attr_set(
 int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags)
 {
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +519,7 @@ xfs_attrmulti_by_handle(
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct dentry		*dentry;
 	unsigned int		i, size;
-	char			*attr_name;
+	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
@@ -547,7 +547,7 @@ xfs_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				ops[i].am_attrname, MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
 			error = -ERANGE;
@@ -1431,6 +1431,9 @@ xfs_file_ioctl(
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -XFS_ERROR(EROFS);
+
 		if (copy_from_user(&inout, arg, sizeof(inout)))
 			return -XFS_ERROR(EFAULT);
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
 extern int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags);
 
 extern int
-	xfs_attrmulti_attr_set(
+xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags);
 
 extern int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags);
 
 extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..0bf6d61f0528 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle(
 	compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct dentry		*dentry;
 	unsigned int		i, size;
-	char			*attr_name;
+	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				compat_ptr(ops[i].am_attrname),
 				MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..e8566bbf0f00 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,10 +140,10 @@ xfs_init_security(
 	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
 	void		*value;
-	char		*name;
+	unsigned char	*name;
 	int		error;
 
-	error = security_inode_init_security(inode, dir, &name,
+	error = security_inode_init_security(inode, dir, (char **)&name,
 					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 0d32457abef1..eac6f80d786d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -630,18 +630,9 @@ start:
 	 * by root.  This keeps people from modifying setuid and
 	 * setgid binaries.
 	 */
-
-	if (((xip->i_d.di_mode & S_ISUID) ||
-	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
-		(S_ISGID | S_IXGRP))) &&
-	     !capable(CAP_FSETID)) {
-		error = xfs_write_clear_setuid(xip);
-		if (likely(!error))
-			error = -file_remove_suid(file);
-		if (unlikely(error)) {
-			goto out_unlock_internal;
-		}
-	}
+	error = -file_remove_suid(file);
+	if (unlikely(error))
+		goto out_unlock_internal;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
@@ -784,53 +775,6 @@ write_retry:
 }
 
 /*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(struct xfs_buf *bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
-		    (XFS_BUF_ISREAD(bp)) == 0)
-			return (xfs_bioerror_relse(bp));
-		else
-			return (xfs_bioerror(bp));
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		xfs_buf_iorequest(bp);
-		return;
-	}
-
-	trace_xfs_bdstrat_shut(bp, _RET_IP_);
-	xfs_bioerror_relse(bp);
-}
-
-/*
  * If the underlying (data/log/rt) device is readonly, there are some
  * operations that cannot proceed.
  */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index d1f7789c7ffb..342ae8c0d011 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -22,9 +22,6 @@ struct xfs_mount;
 struct xfs_inode;
 struct xfs_buf;
 
-/* errors from xfsbdstrat() must be extracted from the buffer */
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
 extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..25ea2408118f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -877,12 +877,11 @@ xfsaild(
 {
 	struct xfs_ail	*ailp = data;
 	xfs_lsn_t	last_pushed_lsn = 0;
-	long		tout = 0;
+	long		tout = 0; /* milliseconds */
 
 	while (!kthread_should_stop()) {
-		if (tout)
-			schedule_timeout_interruptible(msecs_to_jiffies(tout));
-		tout = 1000;
+		schedule_timeout_interruptible(tout ?
+				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
 		/* swsusp */
 		try_to_freeze();
@@ -1022,12 +1021,45 @@ xfs_fs_dirty_inode(
 	XFS_I(inode)->i_update_core = 1;
 }
 
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable.
- */
+STATIC int
+xfs_log_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/* we need to return with the lock hold shared */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Note - it's possible that we might have pushed ourselves out of the
+	 * way during trans_reserve which would flush the inode.  But there's
+	 * no guarantee that the inode buffer has actually gone out yet (it's
+	 * delwri).  Plus the buffer could be pinned anyway if it's part of
+	 * an inode in another recent transaction.  So we play it safe and
+	 * fire off the transaction anyway.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
 STATIC int
 xfs_fs_write_inode(
 	struct inode		*inode,
@@ -1035,7 +1067,7 @@ xfs_fs_write_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	int			error = EAGAIN;
 
 	xfs_itrace_entry(ip);
 
@@ -1046,35 +1078,55 @@ xfs_fs_write_inode(
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
 			goto out;
-	}
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		goto out;
 
 	/*
-	 * We make this non-blocking if the inode is contended, return
-	 * EAGAIN to indicate to the caller that they did not succeed.
-	 * This prevents the flush path from blocking on inodes inside
-	 * another operation right now, they get caught later by xfs_sync.
-	 */
-	if (sync) {
+	 * Make sure the inode has hit stable storage. By using the
+	 * log and the fsync transactions we reduce the IOs we have
+	 * to do here from two (log and inode) to just the log.
+	 *
+	 * Note: We still need to do a delwri write of the inode after
+	 * this to flush it to the backing buffer so that bulkstat
+	 * works properly if this is the first time the inode has been
+	 * written. Because we hold the ilock atomically over the
+	 * transaction commit and the inode flush we are guaranteed
+	 * that the inode is not pinned when it returns. If the flush
+	 * lock is already held, then the inode has already been
+	 * flushed once and we don't need to flush it again. Hence
+	 * the code will only flush the inode if it isn't already
+	 * being flushed.
+	 */
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+		if (ip->i_update_core) {
+			error = xfs_log_inode(ip);
+			if (error)
+				goto out_unlock;
+		}
 	} else {
-		error = EAGAIN;
+		/*
+		 * We make this non-blocking if the inode is contended, return
+		 * EAGAIN to indicate to the caller that they did not succeed.
+		 * This prevents the flush path from blocking on inodes inside
+		 * another operation right now, they get caught later by xfs_sync.
+		 */
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-			goto out_unlock;
+	}
 
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+		goto out_unlock;
+
+	/*
+	 * Now we have the flush lock and the inode is not pinned, we can check
+	 * if the inode is really clean as we know that there are no pending
+	 * transaction completions, it is not waiting on the delayed write
+	 * queue and there is no IO in progress.
+	 */
+	if (xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		error = 0;
+		goto out_unlock;
 	}
+	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1257,6 +1309,29 @@ xfs_fs_statfs(
 	return 0;
 }
 
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks = 0;
+
+	mp->m_resblks_save = mp->m_resblks;
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks;
+
+	if (mp->m_resblks_save) {
+		resblks = mp->m_resblks_save;
+		mp->m_resblks_save = 0;
+	} else
+		resblks = xfs_default_resblks(mp);
+
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
@@ -1336,11 +1411,27 @@ xfs_fs_remount(
 		}
 		mp->m_update_flags = 0;
 	}
+
+		/*
+		 * Fill out the reserve pool if it is empty. Use the stashed
+		 * value if it is non-zero, otherwise go with the default.
+		 */
+		xfs_restore_resvblks(mp);
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/*
+		 * After we have synced the data but before we sync the
+		 * metadata, we need to free up the reserve block pool so that
+		 * the used block count in the superblock on disk is correct at
+		 * the end of the remount. Stash the current reserve pool size
+		 * so that if we get remounted rw, we can return it to the same
+		 * size.
+		 */
+
 		xfs_quiesce_data(mp);
+		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@@ -1359,11 +1450,22 @@ xfs_fs_freeze(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
+xfs_fs_unfreeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_restore_resvblks(mp);
+	return 0;
+}
+
+STATIC int
 xfs_fs_show_options(
 	struct seq_file		*m,
 	struct vfsmount		*mnt)
@@ -1585,6 +1687,7 @@ static const struct super_operations xfs_super_operations = {
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
+	.unfreeze_fs		= xfs_fs_unfreeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
 	.show_options		= xfs_fs_show_options,
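The save/restore pair added above is a small stash-and-refill protocol around freeze and remount. A userspace model of the same state machine (the demo_* names and default value are assumptions; the kernel drains and refills the pool through xfs_reserve_blocks()):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical mount state; mirrors m_resblks/m_resblks_save. */
struct demo_mount {
	uint64_t resblks;	/* current reserve pool size */
	uint64_t resblks_save;	/* stashed size while frozen/ro */
};

static uint64_t demo_default_resblks(void) { return 8192; }

/* Models xfs_save_resvblks(): stash the pool size, then empty the pool. */
static void demo_save(struct demo_mount *mp)
{
	mp->resblks_save = mp->resblks;
	mp->resblks = 0;
}

/* Models xfs_restore_resvblks(): refill from the stash or the default. */
static void demo_restore(struct demo_mount *mp)
{
	if (mp->resblks_save) {
		mp->resblks = mp->resblks_save;
		mp->resblks_save = 0;
	} else {
		mp->resblks = demo_default_resblks();
	}
}

int main(void)
{
	struct demo_mount mp = { .resblks = 1024 };

	demo_save(&mp);		/* freeze: pool drained, size stashed */
	printf("frozen: %llu\n", (unsigned long long)mp.resblks);	/* 0 */
	demo_restore(&mp);	/* unfreeze: original size comes back */
	printf("thawed: %llu\n", (unsigned long long)mp.resblks);	/* 1024 */
	return 0;
}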
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup(
90STATIC int 90STATIC int
91xfs_inode_ag_walk( 91xfs_inode_ag_walk(
92 struct xfs_mount *mp, 92 struct xfs_mount *mp,
93 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
94 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive)
99{ 99{
100 struct xfs_perag *pag = &mp->m_perag[ag];
101 uint32_t first_index; 100 uint32_t first_index;
102 int last_error = 0; 101 int last_error = 0;
103 int skipped; 102 int skipped;
@@ -141,8 +140,6 @@ restart:
141 delay(1); 140 delay(1);
142 goto restart; 141 goto restart;
143 } 142 }
144
145 xfs_put_perag(mp, pag);
146 return last_error; 143 return last_error;
147} 144}
148 145
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator(
160 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
161 158
162 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
163 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
164 continue; 165 continue;
165 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
166 exclusive); 168 exclusive);
169 xfs_perag_put(pag);
167 if (error) { 170 if (error) {
168 last_error = error; 171 last_error = error;
169 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -231,7 +234,7 @@ xfs_sync_inode_data(
231 } 234 }
232 235
233 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
234 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
235 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
236 239
237 out_wait: 240 out_wait:
@@ -267,8 +270,7 @@ xfs_sync_inode_attr(
267 goto out_unlock; 270 goto out_unlock;
268 } 271 }
269 272
270 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
271 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
272 274
273 out_unlock: 275 out_unlock:
274 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -293,10 +295,7 @@ xfs_sync_data(
293 if (error) 295 if (error)
294 return XFS_ERROR(error); 296 return XFS_ERROR(error);
295 297
296 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
297 (flags & SYNC_WAIT) ?
298 XFS_LOG_FORCE | XFS_LOG_SYNC :
299 XFS_LOG_FORCE);
300 return 0; 299 return 0;
301} 300}
302 301
@@ -322,10 +321,6 @@ xfs_commit_dummy_trans(
322 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
323 struct xfs_trans *tp; 322 struct xfs_trans *tp;
324 int error; 323 int error;
325 int log_flags = XFS_LOG_FORCE;
326
327 if (flags & SYNC_WAIT)
328 log_flags |= XFS_LOG_SYNC;
329 324
330 /* 325 /*
331 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -347,11 +342,11 @@ xfs_commit_dummy_trans(
347 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
348 343
349 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
350 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
351 return error; 346 return error;
352} 347}
353 348
354int 349STATIC int
355xfs_sync_fsdata( 350xfs_sync_fsdata(
356 struct xfs_mount *mp, 351 struct xfs_mount *mp,
357 int flags) 352 int flags)
@@ -367,7 +362,7 @@ xfs_sync_fsdata(
367 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
368 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
369 364
370 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
371 if (!bp) 366 if (!bp)
372 goto out; 367 goto out;
373 368
@@ -387,7 +382,7 @@ xfs_sync_fsdata(
387 * become pinned in between there and here. 382 * become pinned in between there and here.
388 */ 383 */
389 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
390 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
391 } 386 }
392 387
393 388
@@ -448,9 +443,6 @@ xfs_quiesce_data(
448 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
449 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
450 445
451 /* drop inode references pinned by filestreams */
452 xfs_filestream_flush(mp);
453
454 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
456 448
@@ -467,16 +459,18 @@ xfs_quiesce_fs(
467{ 459{
468 int count = 0, pincount; 460 int count = 0, pincount;
469 461
462 xfs_reclaim_inodes(mp, 0);
470 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
471 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
472 464
473 /* 465 /*
474 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
475 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
476 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
477 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also so sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
478 */ 471 */
479 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
480 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
481 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
482 if (!pincount) { 476 if (!pincount) {
@@ -575,7 +569,7 @@ xfs_flush_inodes(
575 igrab(inode); 569 igrab(inode);
576 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
577 wait_for_completion(&completion); 571 wait_for_completion(&completion);
578 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
579} 573}
580 574
581/* 575/*
@@ -591,8 +585,8 @@ xfs_sync_worker(
591 int error; 585 int error;
592 586
593 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
594 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
595 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
596 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
597 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
598 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -690,16 +684,17 @@ void
690xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
691 xfs_inode_t *ip) 685 xfs_inode_t *ip)
692{ 686{
693 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
694 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
695 689
690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
696 read_lock(&pag->pag_ici_lock); 691 read_lock(&pag->pag_ici_lock);
697 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
698 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
699 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
700 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
701 read_unlock(&pag->pag_ici_lock); 696 read_unlock(&pag->pag_ici_lock);
702 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
703} 698}
704 699
705void 700void
@@ -712,12 +707,64 @@ __xfs_inode_clear_reclaim_tag(
712 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
713} 708}
714 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri, otherwise we would loop forever requeuing clean inodes,
740 * as the return value of xfs_iflush() cannot distinguish a successful delwri
741 * flush from a clean inode.
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
748 * background reclaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
715STATIC int 760STATIC int
716xfs_reclaim_inode( 761xfs_reclaim_inode(
717 struct xfs_inode *ip, 762 struct xfs_inode *ip,
718 struct xfs_perag *pag, 763 struct xfs_perag *pag,
719 int sync_mode) 764 int sync_mode)
720{ 765{
766 int error = 0;
767
721 /* 768 /*
722 * The radix tree lock here protects a thread in xfs_iget from racing 769 * The radix tree lock here protects a thread in xfs_iget from racing
723 * with us starting reclaim on the inode. Once we have the 770 * with us starting reclaim on the inode. Once we have the
@@ -735,33 +782,70 @@ xfs_reclaim_inode(
735 spin_unlock(&ip->i_flags_lock); 782 spin_unlock(&ip->i_flags_lock);
736 write_unlock(&pag->pag_ici_lock); 783 write_unlock(&pag->pag_ici_lock);
737 784
738 /*
739 * If the inode is still dirty, then flush it out. If the inode
740 * is not in the AIL, then it will be OK to flush it delwri as
741 * long as xfs_iflush() does not keep any references to the inode.
742 * We leave that decision up to xfs_iflush() since it has the
743 * knowledge of whether it's OK to simply do a delwri flush of
744 * the inode or whether we need to wait until the inode is
745 * pulled from the AIL.
746 * We get the flush lock regardless, though, just to make sure
747 * we don't free it while it is being flushed.
748 */
749 xfs_ilock(ip, XFS_ILOCK_EXCL); 785 xfs_ilock(ip, XFS_ILOCK_EXCL);
750 xfs_iflock(ip); 786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
751 816
752 /* 817 /*
753 * In the case of a forced shutdown we rely on xfs_iflush() to 818 * When we have to flush an inode but don't have SYNC_WAIT set, we
754 * wait for the inode to be unpinned before returning an error. 819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here: if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
755 */ 825 */
756 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { 826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
757 /* synchronize with xfs_iflush_done */ 827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
758 xfs_iflock(ip); 828 "inode 0x%llx background reclaim flush failed with %d",
759 xfs_ifunlock(ip); 829 (long long)ip->i_ino, error);
760 } 830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
761 842
843reclaim:
844 xfs_ifunlock(ip);
762 xfs_iunlock(ip, XFS_ILOCK_EXCL); 845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
763 xfs_ireclaim(ip); 846 xfs_ireclaim(ip);
764 return 0; 847 return error;
848
765} 849}
766 850
767int 851int
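
With the rewrite, callers of the reclaim path select blocking behaviour purely through SYNC_WAIT, as the call sites earlier in this patch show; the two modes in sketch form:

    /* background reclaim: non-blocking; skipped inodes are retried on the next pass */
    xfs_reclaim_inodes(mp, 0);

    /* quiesce/unmount reclaim: waits on flush locks and pinned inodes */
    xfs_reclaim_inodes(mp, SYNC_WAIT);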
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..a4574dcf5065 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
78 ) 78 )
79) 79)
80 80
81#define DEFINE_PERAG_REF_EVENT(name) \
82TRACE_EVENT(name, \
83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
86 TP_STRUCT__entry( \
87 __field(dev_t, dev) \
88 __field(xfs_agnumber_t, agno) \
89 __field(int, refcount) \
90 __field(unsigned long, caller_ip) \
91 ), \
92 TP_fast_assign( \
93 __entry->dev = mp->m_super->s_dev; \
94 __entry->agno = agno; \
95 __entry->refcount = refcount; \
96 __entry->caller_ip = caller_ip; \
97 ), \
98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
99 MAJOR(__entry->dev), MINOR(__entry->dev), \
100 __entry->agno, \
101 __entry->refcount, \
102 (char *)__entry->caller_ip) \
103);
104
105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
107
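Each DEFINE_PERAG_REF_EVENT expansion is meant to be fired from the corresponding helper. A plausible call site for the get side, assuming the reference count is kept in an atomic_t inside struct xfs_perag (that structure is not part of this hunk, so the field name below is an assumption):

    ref = atomic_inc_return(&pag->pag_ref);    /* pag_ref is an assumed field name */
    trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
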
81#define DEFINE_ATTR_LIST_EVENT(name) \ 108#define DEFINE_ATTR_LIST_EVENT(name) \
82DEFINE_EVENT(xfs_attr_list_class, name, \ 109DEFINE_EVENT(xfs_attr_list_class, name, \
83 TP_PROTO(struct xfs_attr_list_context *ctx), \ 110 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -456,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
456DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
457DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
458DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
459DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
460DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
461DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -1414,6 +1442,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
1414 __entry->count) 1442 __entry->count)
1415); 1443);
1416 1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
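The "which" argument selects the symbolic label from XFS_SWAPEXT_INODES, so the extent-swap path would presumably trace both inodes around the swap. A hedged sketch, where tip is a stand-in name for the temporary inode:

    trace_xfs_swap_extent_before(ip, 0);     /* 0 prints as "target" */
    trace_xfs_swap_extent_before(tip, 1);    /* 1 prints as "temp" */
    /* ... swap the data forks of ip and tip ... */
    trace_xfs_swap_extent_after(ip, 0);
    trace_xfs_swap_extent_after(tip, 1);
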
1417#endif /* _TRACE_XFS_H */ 1498#endif /* _TRACE_XFS_H */
1418 1499
1419#undef TRACE_INCLUDE_PATH 1500#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
45 value = NULL; 45 value = NULL;
46 } 46 }
47 47
48 error = -xfs_attr_get(ip, name, value, &asize, xflags); 48 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
49 if (error) 49 if (error)
50 return error; 50 return error;
51 return asize; 51 return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
67 xflags |= ATTR_REPLACE; 67 xflags |= ATTR_REPLACE;
68 68
69 if (!value) 69 if (!value)
70 return -xfs_attr_remove(ip, name, xflags); 70 return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
71 return -xfs_attr_set(ip, name, (void *)value, size, xflags); 71 return -xfs_attr_set(ip, (unsigned char *)name,
72 (void *)value, size, xflags);
72} 73}
73 74
74static struct xattr_handler xfs_xattr_user_handler = { 75static struct xattr_handler xfs_xattr_user_handler = {
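
Two conventions are at work in the hunks above: the leading minus converts XFS's positive internal error codes into the negative errno values the VFS expects, and the (unsigned char *) casts adapt the VFS's char * attribute names to the attr API's new unsigned char * signatures. In sketch form:

    /* xfs_attr_get() returns 0 or a positive XFS error code ... */
    error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
    if (error)
            return error;   /* ... so error is already a negative errno here */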
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
124} 125}
125 126
126static int 127static int
127xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, 128xfs_xattr_put_listent(
128 char *name, int namelen, int valuelen, char *value) 129 struct xfs_attr_list_context *context,
130 int flags,
131 unsigned char *name,
132 int namelen,
133 int valuelen,
134 unsigned char *value)
129{ 135{
130 unsigned int prefix_len = xfs_xattr_prefix_len(flags); 136 unsigned int prefix_len = xfs_xattr_prefix_len(flags);
131 char *offset; 137 char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
148 offset = (char *)context->alist + context->count; 154 offset = (char *)context->alist + context->count;
149 strncpy(offset, xfs_xattr_prefix(flags), prefix_len); 155 strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
150 offset += prefix_len; 156 offset += prefix_len;
151 strncpy(offset, name, namelen); /* real name */ 157 strncpy(offset, (char *)name, namelen); /* real name */
152 offset += namelen; 158 offset += namelen;
153 *offset = '\0'; 159 *offset = '\0';
154 context->count += prefix_len + namelen + 1; 160 context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
156} 162}
157 163
158static int 164static int
159xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, 165xfs_xattr_put_listent_sizes(
160 char *name, int namelen, int valuelen, char *value) 166 struct xfs_attr_list_context *context,
167 int flags,
168 unsigned char *name,
169 int namelen,
170 int valuelen,
171 unsigned char *value)
161{ 172{
162 context->count += xfs_xattr_prefix_len(flags) + namelen + 1; 173 context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
163 return 0; 174 return 0;