author    Linus Torvalds <torvalds@linux-foundation.org>    2011-10-28 19:42:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-10-28 19:42:18 -0400
commit    97d2eb13a019ec09cc1a7ea2d3705c0b117b3c0d (patch)
tree      86f6382941f8cfc41647d33d87bec7bc1407c18c /fs
parent    68d99b2c8efcb6ed3807a55569300c53b5f88be5 (diff)
parent    339573406737461cfb17bebabf7ba536a302d841 (diff)
Merge branch 'for-linus' of git://ceph.newdream.net/git/ceph-client
* 'for-linus' of git://ceph.newdream.net/git/ceph-client:
libceph: fix double-free of page vector
ceph: fix 32-bit ino numbers
libceph: force resend of osd requests if we skip an osdmap
ceph: use kernel DNS resolver
ceph: fix ceph_monc_init memory leak
ceph: let the set_layout ioctl set single traits
Revert "ceph: don't truncate dirty pages in invalidate work thread"
ceph: replace leading spaces with tabs
libceph: warn on msg allocation failures
libceph: don't complain on msgpool alloc failures
libceph: always preallocate mon connection
libceph: create messenger with client
ceph: document ioctls
ceph: implement (optional) max read size
ceph: rename rsize -> rasize
ceph: make readpages fully async
Diffstat (limited to 'fs')
 fs/ceph/addr.c       | 193
 fs/ceph/caps.c       |   2
 fs/ceph/inode.c      |  46
 fs/ceph/ioctl.c      |  34
 fs/ceph/ioctl.h      |  55
 fs/ceph/mds_client.c |  11
 fs/ceph/super.c      |  61
 fs/ceph/super.h      |  19
 8 files changed, 260 insertions(+), 161 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5a3953db8118..4144caf2f9d3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page)
 }
 
 /*
- * Build a vector of contiguous pages from the provided page list.
+ * Finish an async read(ahead) op.
  */
-static struct page **page_vector_from_list(struct list_head *page_list,
-					   unsigned *nr_pages)
+static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
-	struct page **pages;
-	struct page *page;
-	int next_index, contig_pages = 0;
+	struct inode *inode = req->r_inode;
+	struct ceph_osd_reply_head *replyhead;
+	int rc, bytes;
+	int i;
 
-	/* build page vector */
-	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
+	/* parse reply */
+	replyhead = msg->front.iov_base;
+	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+	rc = le32_to_cpu(replyhead->result);
+	bytes = le32_to_cpu(msg->hdr.data_len);
 
-	BUG_ON(list_empty(page_list));
-	next_index = list_entry(page_list->prev, struct page, lru)->index;
-	list_for_each_entry_reverse(page, page_list, lru) {
-		if (page->index == next_index) {
-			dout("readpages page %d %p\n", contig_pages, page);
-			pages[contig_pages] = page;
-			contig_pages++;
-			next_index++;
-		} else {
-			break;
+	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+
+	/* unlock all pages, zeroing any data we didn't read */
+	for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
+		struct page *page = req->r_pages[i];
+
+		if (bytes < (int)PAGE_CACHE_SIZE) {
+			/* zero (remainder of) page */
+			int s = bytes < 0 ? 0 : bytes;
+			zero_user_segment(page, s, PAGE_CACHE_SIZE);
 		}
+		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
+		     page->index);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
 	}
-	*nr_pages = contig_pages;
-	return pages;
+	kfree(req->r_pages);
 }
 
 /*
- * Read multiple pages.  Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
+ * start an async read(ahead) operation.  return nr_pages we submitted
+ * a read for on success, or negative error code.
  */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-			  struct list_head *page_list, unsigned nr_pages)
+static int start_read(struct inode *inode, struct list_head *page_list, int max)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
-	int rc = 0;
-	struct page **pages;
-	loff_t offset;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct page *page = list_entry(page_list->prev, struct page, lru);
+	struct ceph_osd_request *req;
+	u64 off;
 	u64 len;
+	int i;
+	struct page **pages;
+	pgoff_t next_index;
+	int nr_pages = 0;
+	int ret;
 
-	dout("readpages %p file %p nr_pages %d\n",
-	     inode, file, nr_pages);
-
-	pages = page_vector_from_list(page_list, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
+	off = page->index << PAGE_CACHE_SHIFT;
 
-	/* guess read extent */
-	offset = pages[0]->index << PAGE_CACHE_SHIFT;
+	/* count pages */
+	next_index = page->index;
+	list_for_each_entry_reverse(page, page_list, lru) {
+		if (page->index != next_index)
+			break;
+		nr_pages++;
+		next_index++;
+		if (max && nr_pages == max)
+			break;
+	}
 	len = nr_pages << PAGE_CACHE_SHIFT;
-	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				 offset, &len,
-				 ci->i_truncate_seq, ci->i_truncate_size,
-				 pages, nr_pages, 0);
-	if (rc == -ENOENT)
-		rc = 0;
-	if (rc < 0)
-		goto out;
-
-	for (; !list_empty(page_list) && len > 0;
-	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
-		struct page *page =
-			list_entry(page_list->prev, struct page, lru);
+	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
+	     off, len);
+
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+				    off, &len,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    NULL, false, 1, 0);
+	if (!req)
+		return -ENOMEM;
 
+	/* build page vector */
+	nr_pages = len >> PAGE_CACHE_SHIFT;
+	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+	ret = -ENOMEM;
+	if (!pages)
+		goto out;
+	for (i = 0; i < nr_pages; ++i) {
+		page = list_entry(page_list->prev, struct page, lru);
+		BUG_ON(PageLocked(page));
 		list_del(&page->lru);
 
-		if (rc < (int)PAGE_CACHE_SIZE) {
-			/* zero (remainder of) page */
-			int s = rc < 0 ? 0 : rc;
-			zero_user_segment(page, s, PAGE_CACHE_SIZE);
-		}
-
-		if (add_to_page_cache_lru(page, mapping, page->index,
+		dout("start_read %p adding %p idx %lu\n", inode, page,
+		     page->index);
+		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
 					  GFP_NOFS)) {
 			page_cache_release(page);
-			dout("readpages %p add_to_page_cache failed %p\n",
+			dout("start_read %p add_to_page_cache failed %p\n",
 			     inode, page);
-			continue;
+			nr_pages = i;
+			goto out_pages;
 		}
-		dout("readpages %p adding %p idx %lu\n", inode, page,
-		     page->index);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-		unlock_page(page);
-		page_cache_release(page);
+		pages[i] = page;
 	}
-	rc = 0;
+	req->r_pages = pages;
+	req->r_num_pages = nr_pages;
+	req->r_callback = finish_read;
+	req->r_inode = inode;
+
+	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
+	ret = ceph_osdc_start_request(osdc, req, false);
+	if (ret < 0)
+		goto out_pages;
+	ceph_osdc_put_request(req);
+	return nr_pages;
 
+out_pages:
+	ceph_release_page_vector(pages, nr_pages);
+out:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+
+
+/*
+ * Read multiple pages.  Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *page_list, unsigned nr_pages)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	int rc = 0;
+	int max = 0;
+
+	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+			>> PAGE_SHIFT;
+
+	dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+	     max);
+	while (!list_empty(page_list)) {
+		rc = start_read(inode, page_list, max);
+		if (rc < 0)
+			goto out;
+		BUG_ON(rc == 0);
+	}
 out:
-	kfree(pages);
+	dout("readpages %p file %p ret %d\n", inode, file, rc);
 	return rc;
 }
 
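Note on the new read path: ceph_readpages() now just derives a per-request page cap from the rsize mount option and loops, handing page_list to start_read() until it is empty; each start_read() builds one OSD read and finish_read() completes the pages asynchronously. As a rough worked example (assuming 4 KB pages, an assumption for illustration): with the new default rsize=0 there is no cap, so one request covers the whole contiguous run; with rsize=524288 the cap is (524288 + 4095) >> PAGE_SHIFT = 128 pages, so a 1024-page readahead window is issued as eight independent asynchronous reads.
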
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8d74ad7ba556..b8731bf3ef1f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -945,7 +945,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	     seq, issue_seq, mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
 	if (!msg)
 		return -ENOMEM;
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 095799ba9dd1..5dde7d51dc11 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,7 +9,6 @@
 #include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/vmalloc.h>
-#include <linux/pagevec.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -1364,49 +1363,6 @@ void ceph_queue_invalidate(struct inode *inode)
 }
 
 /*
- * invalidate any pages that are not dirty or under writeback.  this
- * includes pages that are clean and mapped.
- */
-static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
-{
-	struct pagevec pvec;
-	pgoff_t next = 0;
-	int i;
-
-	pagevec_init(&pvec, 0);
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			pgoff_t index;
-			int skip_page =
-				(PageDirty(page) || PageWriteback(page));
-
-			if (!skip_page)
-				skip_page = !trylock_page(page);
-
-			/*
-			 * We really shouldn't be looking at the ->index of an
-			 * unlocked page.  But we're not allowed to lock these
-			 * pages.  So we rely upon nobody altering the ->index
-			 * of this (pinned-by-us) page.
-			 */
-			index = page->index;
-			if (index > next)
-				next = index;
-			next++;
-
-			if (skip_page)
-				continue;
-
-			generic_error_remove_page(mapping, page);
-			unlock_page(page);
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
-/*
  * Invalidate inode pages in a worker thread.  (This can't be done
  * in the message handler context.)
  */
@@ -1429,7 +1385,7 @@ static void ceph_invalidate_work(struct work_struct *work)
 	orig_gen = ci->i_rdcache_gen;
 	spin_unlock(&inode->i_lock);
 
-	ceph_invalidate_nondirty_pages(inode->i_mapping);
+	truncate_inode_pages(&inode->i_data, 0);
 
 	spin_lock(&inode->i_lock);
 	if (orig_gen == ci->i_rdcache_gen &&
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 3b256b50f7d8..5a14c29cbba6 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
+	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+	struct ceph_ioctl_layout nl;
 	int err, i;
 
-	/* copy and validate */
 	if (copy_from_user(&l, arg, sizeof(l)))
 		return -EFAULT;
 
-	if ((l.object_size & ~PAGE_MASK) ||
-	    (l.stripe_unit & ~PAGE_MASK) ||
-	    !l.stripe_unit ||
-	    (l.object_size &&
-	     (unsigned)l.object_size % (unsigned)l.stripe_unit))
+	/* validate changed params against current layout */
+	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+	if (!err) {
+		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+		nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		nl.preferred_osd =
+				(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+	} else
+		return err;
+
+	if (l.stripe_count)
+		nl.stripe_count = l.stripe_count;
+	if (l.stripe_unit)
+		nl.stripe_unit = l.stripe_unit;
+	if (l.object_size)
+		nl.object_size = l.object_size;
+	if (l.data_pool)
+		nl.data_pool = l.data_pool;
+	if (l.preferred_osd)
+		nl.preferred_osd = l.preferred_osd;
+
+	if ((nl.object_size & ~PAGE_MASK) ||
+	    (nl.stripe_unit & ~PAGE_MASK) ||
+	    ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
 		return -EINVAL;
 
 	/* make sure it's a valid data pool */
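With the ioctl.c change above, any field left at zero in struct ceph_ioctl_layout now means "keep the current value", so a single layout trait can be changed in isolation. A minimal userspace sketch (illustrative only; the helper name, include path, and error handling are assumptions, not part of this patch):

/* Illustrative sketch: change only the stripe count on a new file,
 * leaving the other layout fields at 0 so the kernel keeps them. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* struct ceph_ioctl_layout, CEPH_IOC_SET_LAYOUT */

static int set_stripe_count(const char *path, __u64 count)
{
	struct ceph_ioctl_layout l;
	int fd = open(path, O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return -1;
	memset(&l, 0, sizeof(l));	/* zeroed fields mean "keep current" */
	l.stripe_count = count;
	if (ioctl(fd, CEPH_IOC_SET_LAYOUT, &l) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
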
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 0c5167e43180..be4a60487333 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -6,7 +6,31 @@
 
 #define CEPH_IOCTL_MAGIC 0x97
 
-/* just use u64 to align sanely on all archs */
+/*
+ * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy
+ * CEPH_IOC_SET_LAYOUT - set file layout
+ * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy
+ *
+ * The file layout specifies how file data is striped over objects in
+ * the distributed object store, which object pool they belong to (if
+ * it differs from the default), and an optional 'preferred osd' to
+ * store them on.
+ *
+ * Files get a new layout based on the policy set on the containing
+ * directory or one of its ancestors.  The GET_LAYOUT ioctl will let
+ * you examine the layout for a file or the policy on a directory.
+ *
+ * SET_LAYOUT will let you set a layout on a newly created file.  This
+ * only works immediately after the file is created and before any
+ * data is written to it.
+ *
+ * SET_LAYOUT_POLICY will let you set a layout policy (default layout)
+ * on a directory that will apply to any new files created in that
+ * directory (or any child directory that doesn't specify a layout of
+ * its own).
+ */
+
+/* use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
 	__u64 stripe_unit, stripe_count, object_size;
 	__u64 data_pool;
@@ -21,6 +45,8 @@ struct ceph_ioctl_layout {
 				   struct ceph_ioctl_layout)
 
 /*
+ * CEPH_IOC_GET_DATALOC - get location of file data in the cluster
+ *
  * Extract identity, address of the OSD and object storing a given
  * file offset.
  */
@@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc {
 #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
 				   struct ceph_ioctl_dataloc)
 
+/*
+ * CEPH_IOC_LAZYIO - relax consistency
+ *
+ * Normally Ceph switches to synchronous IO when multiple clients have
+ * the file open (and one or more for write).  Reads and writes bypass the
+ * page cache and go directly to the OSD.  Setting this flag on a file
+ * descriptor will allow buffered IO for this file in cases where the
+ * application knows it won't interfere with other nodes (or doesn't
+ * care).
+ */
 #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
+
+/*
+ * CEPH_IOC_SYNCIO - force synchronous IO
+ *
+ * This ioctl sets a file flag that forces the synchronous IO that
+ * bypasses the page cache, even if it is not necessary.  This is
+ * essentially the opposite behavior of IOC_LAZYIO.  This forces the
+ * same read/write path as a file opened by multiple clients when one
+ * or more of those clients is opened for write.
+ *
+ * Note that this type of sync IO takes a different path than a file
+ * opened with O_SYNC/D_SYNC (writes hit the page cache and are
+ * immediately flushed on page boundaries).  It is very similar to
+ * O_DIRECT (writes bypass the page cache) except that O_DIRECT writes
+ * are not copied (user page must remain stable) and O_DIRECT writes
+ * have alignment restrictions (on the buffer and file offset).
+ */
 #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
 
 #endif
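The new comment blocks double as user documentation for these ioctls. As a hedged example of how an application might opt into relaxed consistency on an already-open descriptor (illustrative only, not part of the patch):

/* Illustrative sketch: request lazy (relaxed-consistency) IO on an open fd. */
#include <sys/ioctl.h>
#include "ioctl.h"	/* CEPH_IOC_LAZYIO from fs/ceph/ioctl.h */

static int enable_lazyio(int fd)
{
	/* no argument; the flag applies only to this open file */
	return ioctl(fd, CEPH_IOC_LAZYIO, 0);
}
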
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 86c59e16ba74..1d72f15fe9f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
+			   false);
 	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
 		return NULL;
@@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
 	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
 		spin_unlock(&session->s_cap_lock);
 		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-				   GFP_NOFS);
+				   GFP_NOFS, false);
 		if (!msg)
 			goto out_unlocked;
 		dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1652,7 +1653,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	if (req->r_old_dentry_drop)
 		len += req->r_old_dentry->d_name.len;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
 	if (!msg) {
 		msg = ERR_PTR(-ENOMEM);
 		goto out_free2;
@@ -2518,7 +2519,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 		goto fail_nopagelist;
 	ceph_pagelist_init(pagelist);
 
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
 	if (!reply)
 		goto fail_nomsg;
 
@@ -2831,7 +2832,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	dnamelen = dentry->d_name.len;
 	len += dnamelen;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
 	if (!msg)
 		return;
 	lease = msg->front.iov_base;
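Each ceph_msg_new() call in fs/ceph now passes a trailing false. That corresponds to the libceph side of this series ("libceph: warn on msg allocation failures"), which adds a can_fail flag to the allocator; roughly the following prototype, a sketch of the messenger API after this merge rather than something shown in these hunks:

/* Sketch of the updated allocator signature (assumed, see the libceph
 * patches in this merge): callers passing can_fail = false get a warning
 * dumped if the allocation fails, while msgpool refills pass true and
 * fall back to the preallocated pool instead. */
struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
			      bool can_fail);
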
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 88bacaf385d9..788f5ad8e66d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 enum {
 	Opt_wsize,
 	Opt_rsize,
+	Opt_rasize,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_cap_release_safety,
@@ -136,6 +137,7 @@ enum {
 static match_table_t fsopt_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
+	{Opt_rasize, "rasize=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_rsize:
 		fsopt->rsize = intval;
 		break;
+	case Opt_rasize:
+		fsopt->rasize = intval;
+		break;
 	case Opt_caps_wanted_delay_min:
 		fsopt->caps_wanted_delay_min = intval;
 		break;
@@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
 	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
 
 	fsopt->sb_flags = flags;
 	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 
 	fsopt->rsize = CEPH_RSIZE_DEFAULT;
-	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	fsopt->rasize = CEPH_RASIZE_DEFAULT;
+	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 	fsopt->congestion_kb = default_congestion_kb();
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
 	err = -EINVAL;
 	if (!dev_name)
 		goto out;
 	*path = strstr(dev_name, ":/");
 	if (*path == NULL) {
 		pr_err("device name is missing path (no :/ in %s)\n",
 		       dev_name);
 		goto out;
 	}
 	dev_name_end = *path;
 	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 
@@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
 	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 		seq_printf(m, ",rsize=%d", fsopt->rsize);
+	if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
+		seq_printf(m, ",rasize=%d", fsopt->rsize);
 	if (fsopt->congestion_kb != default_congestion_kb())
 		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
 	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
@@ -422,20 +430,23 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 					struct ceph_options *opt)
 {
 	struct ceph_fs_client *fsc;
+	const unsigned supported_features =
+		CEPH_FEATURE_FLOCK |
+		CEPH_FEATURE_DIRLAYOUTHASH;
+	const unsigned required_features = 0;
 	int err = -ENOMEM;
 
 	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
 	if (!fsc)
 		return ERR_PTR(-ENOMEM);
 
-	fsc->client = ceph_create_client(opt, fsc);
+	fsc->client = ceph_create_client(opt, fsc, supported_features,
+					 required_features);
 	if (IS_ERR(fsc->client)) {
 		err = PTR_ERR(fsc->client);
 		goto fail;
 	}
 	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
-	fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
-		CEPH_FEATURE_DIRLAYOUTHASH;
 	fsc->client->monc.want_mdsmap = 1;
 
 	fsc->mount_options = fsopt;
@@ -774,10 +785,10 @@ static int ceph_register_bdi(struct super_block *sb,
 {
 	int err;
 
-	/* set ra_pages based on rsize mount option? */
-	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+	/* set ra_pages based on rasize mount option? */
+	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
 		fsc->backing_dev_info.ra_pages =
-			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
 	else
 		fsc->backing_dev_info.ra_pages =
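With rsize and rasize split, readahead and per-request read size are tuned independently: rasize (default 8192*1024, i.e. 8 MB) sizes the backing_dev_info readahead window, while rsize (default now 0, meaning no cap) limits an individual read request. For example, mounting with -o rasize=4194304 halves the readahead window without touching the read-request size (example values only).
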
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a23eed526f05..b01442aaf278 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
 #define ceph_test_mount_opt(fsc, opt) \
 	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define CEPH_RSIZE_DEFAULT             (512*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT             0           /* max read size */
+#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
@@ -45,8 +46,9 @@ struct ceph_mount_options {
 	int flags;
 	int sb_flags;
 
-	int wsize;
-	int rsize;            /* max readahead */
+	int wsize;            /* max write size */
+	int rsize;            /* max read size */
+	int rasize;           /* max readahead */
 	int congestion_kb;    /* max writeback in flight */
 	int caps_wanted_delay_min, caps_wanted_delay_max;
 	int cap_release_safety;
@@ -344,9 +346,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
  * x86_64+ino32  64                     32
  * x86_64        64                     64
  */
-static inline u32 ceph_ino_to_ino32(ino_t ino)
+static inline u32 ceph_ino_to_ino32(__u64 vino)
 {
-	ino ^= ino >> (sizeof(ino) * 8 - 32);
+	u32 ino = vino & 0xffffffff;
+	ino ^= vino >> 32;
 	if (!ino)
 		ino = 1;
 	return ino;
@@ -357,11 +360,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino)
  */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
-	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
 #if BITS_PER_LONG == 32
-	ino = ceph_ino_to_ino32(ino);
+	return ceph_ino_to_ino32(vino.ino);
+#else
+	return (ino_t)vino.ino;
 #endif
-	return ino;
 }
 
 /*
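The reworked ceph_ino_to_ino32() folds the full 64-bit ino onto 32 bits instead of xor-shifting an ino_t that may already have been truncated to 32 bits. A standalone worked example of the same folding (hypothetical input value, mirroring the helper above):

/* Worked example of the 64-bit -> 32-bit ino folding used above. */
#include <stdint.h>
#include <stdio.h>

static uint32_t fold_ino32(uint64_t vino)
{
	uint32_t ino = vino & 0xffffffff;	/* keep the low 32 bits */

	ino ^= vino >> 32;			/* mix in the high 32 bits */
	if (!ino)
		ino = 1;			/* never report inode 0 */
	return ino;
}

int main(void)
{
	/* 0x100000002: low 0x2 ^ high 0x1 == 0x3 */
	printf("%#x\n", fold_ino32(0x100000002ULL));
	return 0;
}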