diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-05-09 18:16:19 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-07-10 20:31:51 -0400 |
commit | 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8 (patch) | |
tree | ba45bef3e0b875feb67b97aebe8295159852ef97 /fs | |
parent | 607d44aa3fa6f40b0facaf1028886ed362b92682 (diff) |
ocfs2: shared writeable mmap
Implement cluster-consistent shared writeable mappings using the
->page_mkwrite() callback.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ocfs2/aops.c | 56 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 9 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/mmap.c | 167 |
4 files changed, 200 insertions, 39 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index fc723fb9c981..b8869fd0884f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -1034,7 +1034,8 @@ out: | |||
1034 | */ | 1034 | */ |
1035 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, | 1035 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1036 | struct ocfs2_write_ctxt *wc, | 1036 | struct ocfs2_write_ctxt *wc, |
1037 | u32 cpos, loff_t user_pos, int new) | 1037 | u32 cpos, loff_t user_pos, int new, |
1038 | struct page *mmap_page) | ||
1038 | { | 1039 | { |
1039 | int ret = 0, i; | 1040 | int ret = 0, i; |
1040 | unsigned long start, target_index, index; | 1041 | unsigned long start, target_index, index; |
@@ -1058,11 +1059,36 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1058 | for(i = 0; i < wc->w_num_pages; i++) { | 1059 | for(i = 0; i < wc->w_num_pages; i++) { |
1059 | index = start + i; | 1060 | index = start + i; |
1060 | 1061 | ||
1061 | wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1062 | if (index == target_index && mmap_page) { |
1062 | if (!wc->w_pages[i]) { | 1063 | /* |
1063 | ret = -ENOMEM; | 1064 | * ocfs2_page_mkwrite() is a little different |
1064 | mlog_errno(ret); | 1065 | * and wants us to directly use the page |
1065 | goto out; | 1066 | * passed in. |
1067 | */ | ||
1068 | lock_page(mmap_page); | ||
1069 | |||
1070 | if (mmap_page->mapping != mapping) { | ||
1071 | unlock_page(mmap_page); | ||
1072 | /* | ||
1073 | * Sanity check - the locking in | ||
1074 | * ocfs2_page_mkwrite() should ensure | ||
1075 | * that this code doesn't trigger. | ||
1076 | */ | ||
1077 | ret = -EINVAL; | ||
1078 | mlog_errno(ret); | ||
1079 | goto out; | ||
1080 | } | ||
1081 | |||
1082 | page_cache_get(mmap_page); | ||
1083 | wc->w_pages[i] = mmap_page; | ||
1084 | } else { | ||
1085 | wc->w_pages[i] = find_or_create_page(mapping, index, | ||
1086 | GFP_NOFS); | ||
1087 | if (!wc->w_pages[i]) { | ||
1088 | ret = -ENOMEM; | ||
1089 | mlog_errno(ret); | ||
1090 | goto out; | ||
1091 | } | ||
1066 | } | 1092 | } |
1067 | 1093 | ||
1068 | if (index == target_index) | 1094 | if (index == target_index) |
@@ -1213,10 +1239,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |||
1213 | } | 1239 | } |
1214 | } | 1240 | } |
1215 | 1241 | ||
1216 | static int ocfs2_write_begin_nolock(struct address_space *mapping, | 1242 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
1217 | loff_t pos, unsigned len, unsigned flags, | 1243 | loff_t pos, unsigned len, unsigned flags, |
1218 | struct page **pagep, void **fsdata, | 1244 | struct page **pagep, void **fsdata, |
1219 | struct buffer_head *di_bh) | 1245 | struct buffer_head *di_bh, struct page *mmap_page) |
1220 | { | 1246 | { |
1221 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; | 1247 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; |
1222 | unsigned int num_clusters = 0, clusters_to_alloc = 0; | 1248 | unsigned int num_clusters = 0, clusters_to_alloc = 0; |
@@ -1318,7 +1344,7 @@ static int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1318 | * extent. | 1344 | * extent. |
1319 | */ | 1345 | */ |
1320 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | 1346 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, |
1321 | clusters_to_alloc); | 1347 | clusters_to_alloc, mmap_page); |
1322 | if (ret) { | 1348 | if (ret) { |
1323 | mlog_errno(ret); | 1349 | mlog_errno(ret); |
1324 | goto out_commit; | 1350 | goto out_commit; |
@@ -1386,7 +1412,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |||
1386 | } | 1412 | } |
1387 | 1413 | ||
1388 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | 1414 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, |
1389 | fsdata, di_bh); | 1415 | fsdata, di_bh, NULL); |
1390 | if (ret) { | 1416 | if (ret) { |
1391 | mlog_errno(ret); | 1417 | mlog_errno(ret); |
1392 | goto out_fail_data; | 1418 | goto out_fail_data; |
@@ -1407,9 +1433,9 @@ out_fail: | |||
1407 | return ret; | 1433 | return ret; |
1408 | } | 1434 | } |
1409 | 1435 | ||
1410 | static int ocfs2_write_end_nolock(struct address_space *mapping, | 1436 | int ocfs2_write_end_nolock(struct address_space *mapping, |
1411 | loff_t pos, unsigned len, unsigned copied, | 1437 | loff_t pos, unsigned len, unsigned copied, |
1412 | struct page *page, void *fsdata) | 1438 | struct page *page, void *fsdata) |
1413 | { | 1439 | { |
1414 | int i; | 1440 | int i; |
1415 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | 1441 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index bdcdd1ae63a9..389579bd64e3 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -50,6 +50,15 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping, | |||
50 | loff_t pos, unsigned len, unsigned copied, | 50 | loff_t pos, unsigned len, unsigned copied, |
51 | struct page *page, void *fsdata); | 51 | struct page *page, void *fsdata); |
52 | 52 | ||
53 | int ocfs2_write_end_nolock(struct address_space *mapping, | ||
54 | loff_t pos, unsigned len, unsigned copied, | ||
55 | struct page *page, void *fsdata); | ||
56 | |||
57 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
58 | loff_t pos, unsigned len, unsigned flags, | ||
59 | struct page **pagep, void **fsdata, | ||
60 | struct buffer_head *di_bh, struct page *mmap_page); | ||
61 | |||
53 | /* all ocfs2_dio_end_io()'s fault */ | 62 | /* all ocfs2_dio_end_io()'s fault */ |
54 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 63 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
55 | test_bit(0, (unsigned long *)&iocb->private) | 64 | test_bit(0, (unsigned long *)&iocb->private) |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4c850d00c269..a80f31776d94 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1001,6 +1001,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
1001 | goto bail_unlock; | 1001 | goto bail_unlock; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | /* | ||
1005 | * This will intentionally not wind up calling vmtruncate(), | ||
1006 | * since all the work for a size change has been done above. | ||
1007 | * Otherwise, we could get into problems with truncate as | ||
1008 | * ip_alloc_sem is used there to protect against i_size | ||
1009 | * changes. | ||
1010 | */ | ||
1004 | status = inode_setattr(inode, attr); | 1011 | status = inode_setattr(inode, attr); |
1005 | if (status < 0) { | 1012 | if (status < 0) { |
1006 | mlog_errno(status); | 1013 | mlog_errno(status); |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af01158b39f5..d79aa12137d2 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -37,11 +37,29 @@ | |||
37 | 37 | ||
38 | #include "ocfs2.h" | 38 | #include "ocfs2.h" |
39 | 39 | ||
40 | #include "aops.h" | ||
40 | #include "dlmglue.h" | 41 | #include "dlmglue.h" |
41 | #include "file.h" | 42 | #include "file.h" |
42 | #include "inode.h" | 43 | #include "inode.h" |
43 | #include "mmap.h" | 44 | #include "mmap.h" |
44 | 45 | ||
46 | static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) | ||
47 | { | ||
48 | /* The best way to deal with signals in the vm path is | ||
49 | * to block them upfront, rather than allowing the | ||
50 | * locking paths to return -ERESTARTSYS. */ | ||
51 | sigfillset(blocked); | ||
52 | |||
53 | /* We should technically never get a bad return value | ||
54 | * from sigprocmask */ | ||
55 | return sigprocmask(SIG_BLOCK, blocked, oldset); | ||
56 | } | ||
57 | |||
58 | static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) | ||
59 | { | ||
60 | return sigprocmask(SIG_SETMASK, oldset, NULL); | ||
61 | } | ||
62 | |||
45 | static struct page *ocfs2_nopage(struct vm_area_struct * area, | 63 | static struct page *ocfs2_nopage(struct vm_area_struct * area, |
46 | unsigned long address, | 64 | unsigned long address, |
47 | int *type) | 65 | int *type) |
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, | |||
53 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, | 71 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, |
54 | type); | 72 | type); |
55 | 73 | ||
56 | /* The best way to deal with signals in this path is | 74 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); |
57 | * to block them upfront, rather than allowing the | ||
58 | * locking paths to return -ERESTARTSYS. */ | ||
59 | sigfillset(&blocked); | ||
60 | |||
61 | /* We should technically never get a bad ret return | ||
62 | * from sigprocmask */ | ||
63 | ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
64 | if (ret < 0) { | 75 | if (ret < 0) { |
65 | mlog_errno(ret); | 76 | mlog_errno(ret); |
66 | goto out; | 77 | goto out; |
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, | |||
68 | 79 | ||
69 | page = filemap_nopage(area, address, type); | 80 | page = filemap_nopage(area, address, type); |
70 | 81 | ||
71 | ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | 82 | ret = ocfs2_vm_op_unblock_sigs(&oldset); |
72 | if (ret < 0) | 83 | if (ret < 0) |
73 | mlog_errno(ret); | 84 | mlog_errno(ret); |
74 | out: | 85 | out: |
@@ -76,28 +87,136 @@ out: | |||
76 | return page; | 87 | return page; |
77 | } | 88 | } |
78 | 89 | ||
79 | static struct vm_operations_struct ocfs2_file_vm_ops = { | 90 | static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, |
80 | .nopage = ocfs2_nopage, | 91 | struct page *page) |
81 | }; | 92 | { |
93 | int ret; | ||
94 | struct address_space *mapping = inode->i_mapping; | ||
95 | loff_t pos = page->index << PAGE_CACHE_SHIFT; | ||
96 | unsigned int len = PAGE_CACHE_SIZE; | ||
97 | pgoff_t last_index; | ||
98 | struct page *locked_page = NULL; | ||
99 | void *fsdata; | ||
100 | loff_t size = i_size_read(inode); | ||
82 | 101 | ||
83 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | 102 | /* |
103 | * Another node might have truncated while we were waiting on | ||
104 | * cluster locks. | ||
105 | */ | ||
106 | last_index = size >> PAGE_CACHE_SHIFT; | ||
107 | if (page->index > last_index) { | ||
108 | ret = -EINVAL; | ||
109 | goto out; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * The i_size check above doesn't catch the case where nodes | ||
114 | * truncated and then re-extended the file. We'll re-check the | ||
115 | * page mapping after taking the page lock inside of | ||
116 | * ocfs2_write_begin_nolock(). | ||
117 | */ | ||
118 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | ||
119 | ret = -EINVAL; | ||
120 | goto out; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Call ocfs2_write_begin() and ocfs2_write_end() to take | ||
125 | * advantage of the allocation code there. We pass a write | ||
126 | * length of the whole page (chopped to i_size) to make sure | ||
127 | * the whole thing is allocated. | ||
128 | * | ||
129 | * Since we know the page is up to date, we don't have to | ||
130 | * worry about ocfs2_write_begin() skipping some buffer reads | ||
131 | * because the "write" would invalidate their data. | ||
132 | */ | ||
133 | if (page->index == last_index) | ||
134 | len = size & ~PAGE_CACHE_MASK; | ||
135 | |||
136 | ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, | ||
137 | &fsdata, di_bh, page); | ||
138 | if (ret) { | ||
139 | if (ret != -ENOSPC) | ||
140 | mlog_errno(ret); | ||
141 | goto out; | ||
142 | } | ||
143 | |||
144 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | ||
145 | fsdata); | ||
146 | if (ret < 0) { | ||
147 | mlog_errno(ret); | ||
148 | goto out; | ||
149 | } | ||
150 | BUG_ON(ret != len); | ||
151 | ret = 0; | ||
152 | out: | ||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
84 | { | 157 | { |
85 | int ret = 0, lock_level = 0; | 158 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | 159 | struct buffer_head *di_bh = NULL; |
160 | sigset_t blocked, oldset; | ||
161 | int ret, ret2; | ||
162 | |||
163 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); | ||
164 | if (ret < 0) { | ||
165 | mlog_errno(ret); | ||
166 | return ret; | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * The cluster locks taken will block a truncate from another | ||
171 | * node. Taking the data lock will also ensure that we don't | ||
172 | * attempt page truncation as part of a downconvert. | ||
173 | */ | ||
174 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
175 | if (ret < 0) { | ||
176 | mlog_errno(ret); | ||
177 | goto out; | ||
178 | } | ||
87 | 179 | ||
88 | /* | 180 | /* |
89 | * Only support shared writeable mmap for local mounts which | 181 | * The alloc sem should be enough to serialize with |
90 | * don't know about holes. | 182 | * ocfs2_truncate_file() changing i_size as well as any thread |
183 | * modifying the inode btree. | ||
91 | */ | 184 | */ |
92 | if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | 185 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
93 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | 186 | |
94 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | 187 | ret = ocfs2_data_lock(inode, 1); |
95 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | 188 | if (ret < 0) { |
96 | /* This is -EINVAL because generic_file_readonly_mmap | 189 | mlog_errno(ret); |
97 | * returns it in a similar situation. */ | 190 | goto out_meta_unlock; |
98 | return -EINVAL; | ||
99 | } | 191 | } |
100 | 192 | ||
193 | ret = __ocfs2_page_mkwrite(inode, di_bh, page); | ||
194 | |||
195 | ocfs2_data_unlock(inode, 1); | ||
196 | |||
197 | out_meta_unlock: | ||
198 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
199 | |||
200 | brelse(di_bh); | ||
201 | ocfs2_meta_unlock(inode, 1); | ||
202 | |||
203 | out: | ||
204 | ret2 = ocfs2_vm_op_unblock_sigs(&oldset); | ||
205 | if (ret2 < 0) | ||
206 | mlog_errno(ret2); | ||
207 | |||
208 | return ret; | ||
209 | } | ||
210 | |||
211 | static struct vm_operations_struct ocfs2_file_vm_ops = { | ||
212 | .nopage = ocfs2_nopage, | ||
213 | .page_mkwrite = ocfs2_page_mkwrite, | ||
214 | }; | ||
215 | |||
216 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | ||
217 | { | ||
218 | int ret = 0, lock_level = 0; | ||
219 | |||
101 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, | 220 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, |
102 | file->f_vfsmnt, &lock_level); | 221 | file->f_vfsmnt, &lock_level); |
103 | if (ret < 0) { | 222 | if (ret < 0) { |