aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-05-23 11:13:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-05-23 11:13:39 -0400
commit6483d152acffb83442b90dad1517fde8a7b1e12d (patch)
tree21ba8f033a80c9c312bb18438990feb1d3ec5ebb /fs
parentcb618965bc2073267b7f9345066f502515fcfdf5 (diff)
parent6ab455eeaff6893cd06da33843e840d888cdc04a (diff)
Merge branch 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6
* 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6: [XFS] Fix memory corruption with small buffer reads [XFS] Fix inode list allocation size in writeback. [XFS] Don't allow memory reclaim to wait on the filesystem in inode [XFS] Fix fsync() b0rkage. [XFS] Include linux/random.h in all builds, not just debug builds.
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h19
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h8
-rw-r--r--fs/xfs/xfs_inode.c9
-rw-r--r--fs/xfs/xfs_vnodeops.c112
-rw-r--r--fs/xfs/xfs_vnodeops.h3
7 files changed, 98 insertions, 94 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5105015a75ad..98e0e86093b4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
387 if (unlikely(page == NULL)) { 387 if (unlikely(page == NULL)) {
388 if (flags & XBF_READ_AHEAD) { 388 if (flags & XBF_READ_AHEAD) {
389 bp->b_page_count = i; 389 bp->b_page_count = i;
390 for (i = 0; i < bp->b_page_count; i++)
391 unlock_page(bp->b_pages[i]);
390 return -ENOMEM; 392 return -ENOMEM;
391 } 393 }
392 394
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
416 ASSERT(!PagePrivate(page)); 418 ASSERT(!PagePrivate(page));
417 if (!PageUptodate(page)) { 419 if (!PageUptodate(page)) {
418 page_count--; 420 page_count--;
419 if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) { 421 if (blocksize >= PAGE_CACHE_SIZE) {
422 if (flags & XBF_READ)
423 bp->b_flags |= _XBF_PAGE_LOCKED;
424 } else if (!PagePrivate(page)) {
420 if (test_page_region(page, offset, nbytes)) 425 if (test_page_region(page, offset, nbytes))
421 page_count++; 426 page_count++;
422 } 427 }
423 } 428 }
424 429
425 unlock_page(page);
426 bp->b_pages[i] = page; 430 bp->b_pages[i] = page;
427 offset = 0; 431 offset = 0;
428 } 432 }
429 433
434 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
435 for (i = 0; i < bp->b_page_count; i++)
436 unlock_page(bp->b_pages[i]);
437 }
438
430 if (page_count == bp->b_page_count) 439 if (page_count == bp->b_page_count)
431 bp->b_flags |= XBF_DONE; 440 bp->b_flags |= XBF_DONE;
432 441
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
746 bp->b_count_desired = len; 755 bp->b_count_desired = len;
747 bp->b_buffer_length = buflen; 756 bp->b_buffer_length = buflen;
748 bp->b_flags |= XBF_MAPPED; 757 bp->b_flags |= XBF_MAPPED;
758 bp->b_flags &= ~_XBF_PAGE_LOCKED;
749 759
750 return 0; 760 return 0;
751} 761}
@@ -1093,8 +1103,10 @@ _xfs_buf_ioend(
1093 xfs_buf_t *bp, 1103 xfs_buf_t *bp,
1094 int schedule) 1104 int schedule)
1095{ 1105{
1096 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1106 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1107 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1097 xfs_buf_ioend(bp, schedule); 1108 xfs_buf_ioend(bp, schedule);
1109 }
1098} 1110}
1099 1111
1100STATIC void 1112STATIC void
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
1125 1137
1126 if (--bvec >= bio->bi_io_vec) 1138 if (--bvec >= bio->bi_io_vec)
1127 prefetchw(&bvec->bv_page->flags); 1139 prefetchw(&bvec->bv_page->flags);
1140
1141 if (bp->b_flags & _XBF_PAGE_LOCKED)
1142 unlock_page(page);
1128 } while (bvec >= bio->bi_io_vec); 1143 } while (bvec >= bio->bi_io_vec);
1129 1144
1130 _xfs_buf_ioend(bp, 1); 1145 _xfs_buf_ioend(bp, 1);
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
1163 * filesystem block size is not smaller than the page size. 1178 * filesystem block size is not smaller than the page size.
1164 */ 1179 */
1165 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1180 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1166 (bp->b_flags & XBF_READ) && 1181 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1182 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1167 (blocksize >= PAGE_CACHE_SIZE)) { 1183 (blocksize >= PAGE_CACHE_SIZE)) {
1168 bio = bio_alloc(GFP_NOIO, 1); 1184 bio = bio_alloc(GFP_NOIO, 1);
1169 1185
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 841d7883528d..f948ec7ba9a4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -66,6 +66,25 @@ typedef enum {
66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */
67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
69
70 /*
71 * Special flag for supporting metadata blocks smaller than a FSB.
72 *
73 * In this case we can have multiple xfs_buf_t on a single page and
74 * need to lock out concurrent xfs_buf_t readers as they only
75 * serialise access to the buffer.
76 *
77 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * between reads of the page. Hence we can have one thread read the
79 * page and modify it, but then race with another thread that thinks
80 * the page is not up-to-date and hence reads it again.
81 *
82 * The result is that the first modifcation to the page is lost.
83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * that require truncation and results in the AGI unlinked list
85 * modifications being lost.
86 */
87 _XBF_PAGE_LOCKED = (1 << 22),
69} xfs_buf_flags_t; 88} xfs_buf_flags_t;
70 89
71typedef enum { 90typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4ae..5f60363b9343 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
184 return -xfs_release(XFS_I(inode)); 184 return -xfs_release(XFS_I(inode));
185} 185}
186 186
187/*
188 * We ignore the datasync flag here because a datasync is effectively
189 * identical to an fsync. That is, datasync implies that we need to write
190 * only the metadata needed to be able to access the data that is written
191 * if we crash after the call completes. Hence if we are writing beyond
192 * EOF we have to log the inode size change as well, which makes it a
193 * full fsync. If we don't write beyond EOF, the inode core will be
194 * clean in memory and so we don't need to log the inode, just like
195 * fsync.
196 */
187STATIC int 197STATIC int
188xfs_file_fsync( 198xfs_file_fsync(
189 struct file *filp, 199 struct file *filp,
190 struct dentry *dentry, 200 struct dentry *dentry,
191 int datasync) 201 int datasync)
192{ 202{
193 int flags = FSYNC_WAIT;
194
195 if (datasync)
196 flags |= FSYNC_DATA;
197 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 203 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
198 return -xfs_fsync(XFS_I(dentry->d_inode), flags, 204 return -xfs_fsync(XFS_I(dentry->d_inode));
199 (xfs_off_t)0, (xfs_off_t)-1);
200} 205}
201 206
202/* 207/*
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc7..25eb2a9e8d9b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
230#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ 230#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
231 231
232/* 232/*
233 * Flags to vop_fsync/reclaim.
234 */
235#define FSYNC_NOWAIT 0 /* asynchronous flush */
236#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
237#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
238#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
239
240/*
241 * Tracking vnode activity. 233 * Tracking vnode activity.
242 */ 234 */
243#if defined(XFS_INODE_TRACE) 235#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index cf0bb9c1d621..e569bf5d6cf0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
2974 xfs_mount_t *mp = ip->i_mount; 2974 xfs_mount_t *mp = ip->i_mount;
2975 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2975 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
2976 unsigned long first_index, mask; 2976 unsigned long first_index, mask;
2977 unsigned long inodes_per_cluster;
2977 int ilist_size; 2978 int ilist_size;
2978 xfs_inode_t **ilist; 2979 xfs_inode_t **ilist;
2979 xfs_inode_t *iq; 2980 xfs_inode_t *iq;
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
2985 ASSERT(pag->pagi_inodeok); 2986 ASSERT(pag->pagi_inodeok);
2986 ASSERT(pag->pag_ici_init); 2987 ASSERT(pag->pag_ici_init);
2987 2988
2988 ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); 2989 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2989 ilist = kmem_alloc(ilist_size, KM_MAYFAIL); 2990 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2991 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2990 if (!ilist) 2992 if (!ilist)
2991 return 0; 2993 return 0;
2992 2994
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
2995 read_lock(&pag->pag_ici_lock); 2997 read_lock(&pag->pag_ici_lock);
2996 /* really need a gang lookup range call here */ 2998 /* really need a gang lookup range call here */
2997 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2999 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2998 first_index, 3000 first_index, inodes_per_cluster);
2999 XFS_INODE_CLUSTER_SIZE(mp));
3000 if (nr_found == 0) 3001 if (nr_found == 0)
3001 goto out_free; 3002 goto out_free;
3002 3003
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4bb..e475e3717eb3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
856/* 856/*
857 * xfs_fsync 857 * xfs_fsync
858 * 858 *
859 * This is called to sync the inode and its data out to disk. 859 * This is called to sync the inode and its data out to disk. We need to hold
860 * We need to hold the I/O lock while flushing the data, and 860 * the I/O lock while flushing the data, and the inode lock while flushing the
861 * the inode lock while flushing the inode. The inode lock CANNOT 861 * inode. The inode lock CANNOT be held while flushing the data, so acquire
862 * be held while flushing the data, so acquire after we're done 862 * after we're done with that.
863 * with that.
864 */ 863 */
865int 864int
866xfs_fsync( 865xfs_fsync(
867 xfs_inode_t *ip, 866 xfs_inode_t *ip)
868 int flag,
869 xfs_off_t start,
870 xfs_off_t stop)
871{ 867{
872 xfs_trans_t *tp; 868 xfs_trans_t *tp;
873 int error; 869 int error;
@@ -875,103 +871,79 @@ xfs_fsync(
875 871
876 xfs_itrace_entry(ip); 872 xfs_itrace_entry(ip);
877 873
878 ASSERT(start >= 0 && stop >= -1);
879
880 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 874 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
881 return XFS_ERROR(EIO); 875 return XFS_ERROR(EIO);
882 876
883 if (flag & FSYNC_DATA) 877 /* capture size updates in I/O completion before writing the inode. */
884 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); 878 error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
879 if (error)
880 return XFS_ERROR(error);
885 881
886 /* 882 /*
887 * We always need to make sure that the required inode state 883 * We always need to make sure that the required inode state is safe on
888 * is safe on disk. The vnode might be clean but because 884 * disk. The vnode might be clean but we still might need to force the
889 * of committed transactions that haven't hit the disk yet. 885 * log because of committed transactions that haven't hit the disk yet.
890 * Likewise, there could be unflushed non-transactional 886 * Likewise, there could be unflushed non-transactional changes to the
891 * changes to the inode core that have to go to disk. 887 * inode core that have to go to disk and this requires us to issue
888 * a synchronous transaction to capture these changes correctly.
892 * 889 *
893 * The following code depends on one assumption: that 890 * This code relies on the assumption that if the update_* fields
894 * any transaction that changes an inode logs the core 891 * of the inode are clear and the inode is unpinned then it is clean
895 * because it has to change some field in the inode core 892 * and no action is required.
896 * (typically nextents or nblocks). That assumption
897 * implies that any transactions against an inode will
898 * catch any non-transactional updates. If inode-altering
899 * transactions exist that violate this assumption, the
900 * code breaks. Right now, it figures that if the involved
901 * update_* field is clear and the inode is unpinned, the
902 * inode is clean. Either it's been flushed or it's been
903 * committed and the commit has hit the disk unpinning the inode.
904 * (Note that xfs_inode_item_format() called at commit clears
905 * the update_* fields.)
906 */ 893 */
907 xfs_ilock(ip, XFS_ILOCK_SHARED); 894 xfs_ilock(ip, XFS_ILOCK_SHARED);
908 895
909 /* If we are flushing data then we care about update_size 896 if (!(ip->i_update_size || ip->i_update_core)) {
910 * being set, otherwise we care about update_core
911 */
912 if ((flag & FSYNC_DATA) ?
913 (ip->i_update_size == 0) :
914 (ip->i_update_core == 0)) {
915 /* 897 /*
916 * Timestamps/size haven't changed since last inode 898 * Timestamps/size haven't changed since last inode flush or
917 * flush or inode transaction commit. That means 899 * inode transaction commit. That means either nothing got
918 * either nothing got written or a transaction 900 * written or a transaction committed which caught the updates.
919 * committed which caught the updates. If the 901 * If the latter happened and the transaction hasn't hit the
920 * latter happened and the transaction hasn't 902 * disk yet, the inode will be still be pinned. If it is,
921 * hit the disk yet, the inode will be still 903 * force the log.
922 * be pinned. If it is, force the log.
923 */ 904 */
924 905
925 xfs_iunlock(ip, XFS_ILOCK_SHARED); 906 xfs_iunlock(ip, XFS_ILOCK_SHARED);
926 907
927 if (xfs_ipincount(ip)) { 908 if (xfs_ipincount(ip)) {
928 _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, 909 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
929 XFS_LOG_FORCE | 910 XFS_LOG_FORCE | XFS_LOG_SYNC,
930 ((flag & FSYNC_WAIT)
931 ? XFS_LOG_SYNC : 0),
932 &log_flushed); 911 &log_flushed);
933 } else { 912 } else {
934 /* 913 /*
935 * If the inode is not pinned and nothing 914 * If the inode is not pinned and nothing has changed
936 * has changed we don't need to flush the 915 * we don't need to flush the cache.
937 * cache.
938 */ 916 */
939 changed = 0; 917 changed = 0;
940 } 918 }
941 error = 0;
942 } else { 919 } else {
943 /* 920 /*
944 * Kick off a transaction to log the inode 921 * Kick off a transaction to log the inode core to get the
945 * core to get the updates. Make it 922 * updates. The sync transaction will also force the log.
946 * sync if FSYNC_WAIT is passed in (which
947 * is done by everybody but specfs). The
948 * sync transaction will also force the log.
949 */ 923 */
950 xfs_iunlock(ip, XFS_ILOCK_SHARED); 924 xfs_iunlock(ip, XFS_ILOCK_SHARED);
951 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); 925 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
952 if ((error = xfs_trans_reserve(tp, 0, 926 error = xfs_trans_reserve(tp, 0,
953 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 927 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
954 0, 0, 0))) { 928 if (error) {
955 xfs_trans_cancel(tp, 0); 929 xfs_trans_cancel(tp, 0);
956 return error; 930 return error;
957 } 931 }
958 xfs_ilock(ip, XFS_ILOCK_EXCL); 932 xfs_ilock(ip, XFS_ILOCK_EXCL);
959 933
960 /* 934 /*
961 * Note - it's possible that we might have pushed 935 * Note - it's possible that we might have pushed ourselves out
962 * ourselves out of the way during trans_reserve 936 * of the way during trans_reserve which would flush the inode.
963 * which would flush the inode. But there's no 937 * But there's no guarantee that the inode buffer has actually
964 * guarantee that the inode buffer has actually 938 * gone out yet (it's delwri). Plus the buffer could be pinned
965 * gone out yet (it's delwri). Plus the buffer 939 * anyway if it's part of an inode in another recent
966 * could be pinned anyway if it's part of an 940 * transaction. So we play it safe and fire off the
967 * inode in another recent transaction. So we 941 * transaction anyway.
968 * play it safe and fire off the transaction anyway.
969 */ 942 */
970 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 943 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
971 xfs_trans_ihold(tp, ip); 944 xfs_trans_ihold(tp, ip);
972 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 945 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
973 if (flag & FSYNC_WAIT) 946 xfs_trans_set_sync(tp);
974 xfs_trans_set_sync(tp);
975 error = _xfs_trans_commit(tp, 0, &log_flushed); 947 error = _xfs_trans_commit(tp, 0, &log_flushed);
976 948
977 xfs_iunlock(ip, XFS_ILOCK_EXCL); 949 xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e20..57335ba4ce53 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
18int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, 18int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
19 struct cred *credp); 19 struct cred *credp);
20int xfs_readlink(struct xfs_inode *ip, char *link); 20int xfs_readlink(struct xfs_inode *ip, char *link);
21int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, 21int xfs_fsync(struct xfs_inode *ip);
22 xfs_off_t stop);
23int xfs_release(struct xfs_inode *ip); 22int xfs_release(struct xfs_inode *ip);
24int xfs_inactive(struct xfs_inode *ip); 23int xfs_inactive(struct xfs_inode *ip);
25int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 24int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,