author		Jinshan Xiong <jinshan.xiong@intel.com>	2017-01-28 19:04:34 -0500
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2017-02-03 07:01:37 -0500
commit		198a49a964a0f76f14c26079065fb48c71db48f3 (patch)
tree		ee63dd968f2d2606cf0be671c69ef4d370127bcf
parent		ea3f00df27159969c061031a21965970f85c1b61 (diff)
staging: lustre: clio: revise readahead to support 16MB IO
Readahead currently doesn't handle 16MB RPCs correctly: it assumes the RPCs are a default size instead of querying the underlying size. This work adjusts the readahead policy to issue readahead RPCs according to the underlying RPC size.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Signed-off-by: Gu Zheng <gzheng@ddn.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-7990
Reviewed-on: http://review.whamcloud.com/19368
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
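[Editor's note, not part of the patch] A minimal user-space sketch of the window-trimming idea introduced below in rw.c: the readahead window end is aligned down to the optimal RPC boundary (mirroring the new ras_align() helper), while ria_end_min keeps the window large enough to cover the current read. All page counts and indices here are made-up example values; 4096 pages per RPC corresponds to a 16MB RPC with 4KB pages.

#include <stdio.h>

/* Illustrative only: equivalent of ras_align() from the patch. */
static unsigned long ras_align_demo(unsigned long index,
				    unsigned long rpc_size,
				    unsigned long *remainder)
{
	unsigned long rem = index % rpc_size;

	if (remainder)
		*remainder = rem;
	return index - rem;
}

int main(void)
{
	unsigned long rpc_size = 4096;   /* assumed: 16MB RPC, 4KB pages */
	unsigned long ria_start = 100;   /* example window start (page index) */
	unsigned long ria_end = 9000;    /* example candidate window end, inclusive */
	unsigned long ria_end_min = 120; /* must at least cover the current read */
	unsigned long end;

	/* Trim the window so it ends just before an RPC boundary ... */
	end = ras_align_demo(ria_end + 1, rpc_size, NULL);
	if (end > 0)
		ria_end = end - 1;
	/* ... but never shrink it below the minimum needed for this read. */
	if (ria_end < ria_end_min)
		ria_end = ria_end_min;

	printf("readahead window: [%lu, %lu]\n", ria_start, ria_end);
	return 0;
}

With these example numbers the window [100, 9000] is trimmed to [100, 8191], i.e. it ends exactly on the second 4096-page RPC boundary.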
-rw-r--r--	drivers/staging/lustre/lustre/include/cl_object.h	|   4
-rw-r--r--	drivers/staging/lustre/lustre/ldlm/ldlm_lib.c		|  10
-rw-r--r--	drivers/staging/lustre/lustre/llite/llite_internal.h	|  14
-rw-r--r--	drivers/staging/lustre/lustre/llite/rw.c		| 195
-rw-r--r--	drivers/staging/lustre/lustre/osc/osc_io.c		|   3
5 files changed, 114 insertions(+), 112 deletions(-)
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
index 05203c219f83..e4c0c440f01b 100644
--- a/drivers/staging/lustre/lustre/include/cl_object.h
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -1452,8 +1452,10 @@ struct cl_read_ahead {
 	 * cra_end is included.
 	 */
 	pgoff_t		cra_end;
+	/* optimal RPC size for this read, by pages */
+	unsigned long	cra_rpc_size;
 	/*
-	 * Release routine. If readahead holds resources underneath, this
+	 * Release callback. If readahead holds resources underneath, this
 	 * function should be called to release it.
 	 */
 	void	(*cra_release)(const struct lu_env *env, void *cbdata);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
index 675e25ba2a08..95b8c76f70d0 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -351,13 +351,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 	cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
 	atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
 
-	/* This value may be reduced at connect time in
-	 * ptlrpc_connect_interpret() . We initialize it to only
-	 * 1MB until we know what the performance looks like.
-	 * In the future this should likely be increased. LU-1431
+	/*
+	 * Set it to possible maximum size. It may be reduced by ocd_brw_size
+	 * from OFD after connecting.
 	 */
-	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
-					  LNET_MTU >> PAGE_SHIFT);
+	cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
 
 	/*
 	 * set cl_chunkbits default value to PAGE_CACHE_SHIFT,
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 2c72177c81b6..501957cac8ad 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -281,10 +281,8 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 	return container_of(inode, struct ll_inode_info, lli_vfs_inode);
 }
 
-/* default to about 40meg of readahead on a given system. That much tied
- * up in 512k readahead requests serviced at 40ms each is about 1GB/s.
- */
-#define SBI_DEFAULT_READAHEAD_MAX	(40UL << (20 - PAGE_SHIFT))
+/* default to about 64M of readahead on a given system. */
+#define SBI_DEFAULT_READAHEAD_MAX	(64UL << (20 - PAGE_SHIFT))
 
 /* default to read-ahead full files smaller than 2MB on the second read */
 #define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT))
@@ -321,6 +319,9 @@ struct ll_ra_info {
 struct ra_io_arg {
 	unsigned long ria_start;  /* start offset of read-ahead*/
 	unsigned long ria_end;    /* end offset of read-ahead*/
+	unsigned long ria_reserved; /* reserved pages for read-ahead */
+	unsigned long ria_end_min;  /* minimum end to cover current read */
+	bool          ria_eof;      /* reach end of file */
 	/* If stride read pattern is detected, ria_stoff means where
 	 * stride read is started. Note: for normal read-ahead, the
 	 * value here is meaningless, and also it will not be accessed
@@ -551,6 +552,11 @@ struct ll_readahead_state {
 	 */
 	unsigned long ras_window_start, ras_window_len;
 	/*
+	 * Optimal RPC size. It decides how many pages will be sent
+	 * for each read-ahead.
+	 */
+	unsigned long ras_rpc_size;
+	/*
 	 * Where next read-ahead should start at. This lies within read-ahead
 	 * window. Read-ahead window is read in pieces rather than at once
 	 * because: 1. lustre limits total number of pages under read-ahead by
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
index f10e092979fe..18d3ccbd00e2 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -92,25 +92,6 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
 		goto out;
 	}
 
-	/* If the non-strided (ria_pages == 0) readahead window
-	 * (ria_start + ret) has grown across an RPC boundary, then trim
-	 * readahead size by the amount beyond the RPC so it ends on an
-	 * RPC boundary. If the readahead window is already ending on
-	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
-	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
-	 * The (beyond_rpc != 0) check is skipped since the conditional
-	 * branch is more expensive than subtracting zero from the result.
-	 *
-	 * Strided read is left unaligned to avoid small fragments beyond
-	 * the RPC boundary from needing an extra read RPC.
-	 */
-	if (ria->ria_pages == 0) {
-		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
-
-		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
-			ret -= beyond_rpc;
-	}
-
 	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
 		atomic_sub(ret, &ra->ra_cur_pages);
 		ret = 0;
@@ -147,11 +128,12 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
 
 #define RAS_CDEBUG(ras) \
 	CDEBUG(D_READA, \
-	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
-	       "csr %lu sf %lu sp %lu sl %lu\n", \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \
+	       "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \
 	       ras->ras_last_readpage, ras->ras_consecutive_requests, \
 	       ras->ras_consecutive_pages, ras->ras_window_start, \
 	       ras->ras_window_len, ras->ras_next_readahead, \
+	       ras->ras_rpc_size, \
 	       ras->ras_requests, ras->ras_request_index, \
 	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
 	       ras->ras_stride_pages, ras->ras_stride_length)
@@ -261,20 +243,6 @@ out:
 	       ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
 	       ria->ria_pages)
 
-/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
- * know what the actual RPC size is. If this needs to change, it makes more
- * sense to tune the i_blkbits value for the file based on the OSTs it is
- * striped over, rather than having a constant value for all files here.
- */
-
-/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_SHIFT)).
- * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
- * by default, this should be adjusted corresponding with max_read_ahead_mb
- * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
- * up quickly which will affect read performance significantly. See LU-2816
- */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
-
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
 	return ras->ras_consecutive_stride_requests > 1;
@@ -345,6 +313,17 @@ static int ria_page_count(struct ra_io_arg *ria)
 				       length);
 }
 
+static unsigned long ras_align(struct ll_readahead_state *ras,
+			       unsigned long index,
+			       unsigned long *remainder)
+{
+	unsigned long rem = index % ras->ras_rpc_size;
+
+	if (remainder)
+		*remainder = rem;
+	return index - rem;
+}
+
 /*Check whether the index is in the defined ra-window */
 static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 {
@@ -358,42 +337,63 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 		ria->ria_length < ria->ria_pages);
 }
 
-static int ll_read_ahead_pages(const struct lu_env *env,
-			       struct cl_io *io, struct cl_page_list *queue,
-			       struct ra_io_arg *ria,
-			       unsigned long *reserved_pages,
-			       pgoff_t *ra_end)
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct ll_readahead_state *ras,
+		    struct ra_io_arg *ria)
 {
 	struct cl_read_ahead ra = { 0 };
-	int rc, count = 0;
+	unsigned long ra_end = 0;
 	bool stride_ria;
 	pgoff_t page_idx;
+	int rc;
 
 	LASSERT(ria);
 	RIA_DEBUG(ria);
 
 	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
 	for (page_idx = ria->ria_start;
-	     page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
+	     page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (!ra.cra_end || ra.cra_end < page_idx) {
+				unsigned long end;
+
 				cl_read_ahead_release(env, &ra);
 
 				rc = cl_io_read_ahead(env, io, page_idx, &ra);
 				if (rc < 0)
 					break;
 
+				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+				       page_idx, ra.cra_end, ra.cra_rpc_size);
 				LASSERTF(ra.cra_end >= page_idx,
 					 "object: %p, indcies %lu / %lu\n",
 					 io->ci_obj, ra.cra_end, page_idx);
+				/*
+				 * update read ahead RPC size.
+				 * NB: it's racy but doesn't matter
+				 */
+				if (ras->ras_rpc_size > ra.cra_rpc_size &&
+				    ra.cra_rpc_size > 0)
+					ras->ras_rpc_size = ra.cra_rpc_size;
+				/* trim it to align with optimal RPC size */
+				end = ras_align(ras, ria->ria_end + 1, NULL);
+				if (end > 0 && !ria->ria_eof)
+					ria->ria_end = end - 1;
+				if (ria->ria_end < ria->ria_end_min)
+					ria->ria_end = ria->ria_end_min;
+				if (ria->ria_end > ra.cra_end)
+					ria->ria_end = ra.cra_end;
 			}
 
-			/* If the page is inside the read-ahead window*/
+			/* If the page is inside the read-ahead window */
 			rc = ll_read_ahead_page(env, io, queue, page_idx);
-			if (!rc) {
-				(*reserved_pages)--;
-				count++;
-			}
+			if (rc < 0)
+				break;
+
+			ra_end = page_idx;
+			if (!rc)
+				ria->ria_reserved--;
 		} else if (stride_ria) {
 			/* If it is not in the read-ahead window, and it is
 			 * read-ahead mode, then check whether it should skip
@@ -420,8 +420,7 @@ static int ll_read_ahead_pages(const struct lu_env *env,
 	}
 	cl_read_ahead_release(env, &ra);
 
-	*ra_end = page_idx;
-	return count;
+	return ra_end;
 }
 
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
@@ -431,7 +430,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
 	struct cl_attr *attr = vvp_env_thread_attr(env);
-	unsigned long len, mlen = 0, reserved;
+	unsigned long len, mlen = 0;
 	pgoff_t ra_end, start = 0, end = 0;
 	struct inode *inode;
 	struct ra_io_arg *ria = &lti->lti_ria;
@@ -478,29 +477,15 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	    end < vio->vui_ra_start + vio->vui_ra_count - 1)
 		end = vio->vui_ra_start + vio->vui_ra_count - 1;
 
-	if (end != 0) {
-		unsigned long rpc_boundary;
-		/*
-		 * Align RA window to an optimal boundary.
-		 *
-		 * XXX This would be better to align to cl_max_pages_per_rpc
-		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
-		 * be aligned to the RAID stripe size in the future and that
-		 * is more important than the RPC size.
-		 */
-		/* Note: we only trim the RPC, instead of extending the RPC
-		 * to the boundary, so to avoid reading too much pages during
-		 * random reading.
-		 */
-		rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
-		if (rpc_boundary > 0)
-			rpc_boundary--;
-
-		if (rpc_boundary > start)
-			end = rpc_boundary;
+	if (end) {
+		unsigned long end_index;
 
 		/* Truncate RA window to end of file */
-		end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));
+		end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+		if (end_index <= end) {
+			end = end_index;
+			ria->ria_eof = true;
+		}
 
 		ras->ras_next_readahead = max(end, end + 1);
 		RAS_CDEBUG(ras);
@@ -535,28 +520,31 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	/* at least to extend the readahead window to cover current read */
 	if (!hit && vio->vui_ra_valid &&
 	    vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
+		unsigned long remainder;
+
 		/* to the end of current read window. */
 		mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
 		/* trim to RPC boundary */
-		start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
-		mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
+		ras_align(ras, ria->ria_start, &remainder);
+		mlen = min(mlen, ras->ras_rpc_size - remainder);
+		ria->ria_end_min = ria->ria_start + mlen;
 	}
 
-	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
-	if (reserved < len)
+	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
+	if (ria->ria_reserved < len)
 		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
 
 	CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
-	       reserved, len, mlen,
+	       ria->ria_reserved, len, mlen,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);
+	ra_end = ll_read_ahead_pages(env, io, queue, ras, ria);
 
-	if (reserved != 0)
-		ll_ra_count_put(ll_i2sbi(inode), reserved);
+	if (ria->ria_reserved)
+		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
-	if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
+	if (ra_end == end && ra_end == (kms >> PAGE_SHIFT))
 		ll_ra_stats_inc(inode, RA_STAT_EOF);
 
 	/* if we didn't get to the end of the region we reserved from
@@ -568,13 +556,13 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n",
 	       ra_end, end, ria->ria_end, ret);
 
-	if (ra_end != end + 1) {
+	if (ra_end > 0 && ra_end != end) {
 		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
 		spin_lock(&ras->ras_lock);
-		if (ra_end < ras->ras_next_readahead &&
+		if (ra_end <= ras->ras_next_readahead &&
 		    index_in_window(ra_end, ras->ras_window_start, 0,
 				    ras->ras_window_len)) {
-			ras->ras_next_readahead = ra_end;
+			ras->ras_next_readahead = ra_end + 1;
 			RAS_CDEBUG(ras);
 		}
 		spin_unlock(&ras->ras_lock);
@@ -586,7 +574,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
 			  unsigned long index)
 {
-	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+	ras->ras_window_start = ras_align(ras, index, NULL);
 }
 
 /* called with the ras_lock held or from places where it doesn't matter */
@@ -615,6 +603,7 @@ static void ras_stride_reset(struct ll_readahead_state *ras)
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
 	spin_lock_init(&ras->ras_lock);
+	ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES;
 	ras_reset(inode, ras, 0);
 	ras->ras_requests = 0;
 }
@@ -719,12 +708,15 @@ static void ras_increase_window(struct inode *inode,
 	 * but current clio architecture does not support retrieve such
 	 * information from lower layer. FIXME later
 	 */
-	if (stride_io_mode(ras))
-		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
-	else
-		ras->ras_window_len = min(ras->ras_window_len +
-					  RAS_INCREASE_STEP(inode),
-					  ra->ra_max_pages_per_file);
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, ras->ras_rpc_size);
+	} else {
+		unsigned long wlen;
+
+		wlen = min(ras->ras_window_len + ras->ras_rpc_size,
+			   ra->ra_max_pages_per_file);
+		ras->ras_window_len = ras_align(ras, wlen, NULL);
+	}
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
@@ -852,6 +844,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			 * instead of ras_window_start, which is RPC aligned
 			 */
 			ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+			ras->ras_window_start = max(ras->ras_stride_offset,
+						    ras->ras_window_start);
 		} else {
 			if (ras->ras_next_readahead < ras->ras_window_start)
 				ras->ras_next_readahead = ras->ras_window_start;
@@ -881,7 +875,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		 */
 		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
 		ras->ras_stride_offset = index;
-		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		ras->ras_window_start = max(index, ras->ras_window_start);
 	}
 
 	/* The initial ras_window_len is set to the request size. To avoid
@@ -1098,38 +1092,39 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	struct cl_2queue *queue = &io->ci_queue;
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
 	struct vvp_page *vpg;
+	bool uptodate;
 	int rc = 0;
 
 	vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+	uptodate = vpg->vpg_defer_uptodate;
+
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		struct vvp_io *vio = vvp_env_io(env);
 		enum ras_update_flags flags = 0;
 
-		if (vpg->vpg_defer_uptodate)
+		if (uptodate)
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
 		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
 	}
 
-	if (vpg->vpg_defer_uptodate) {
+	cl_2queue_init(queue);
+	if (uptodate) {
 		vpg->vpg_ra_used = 1;
 		cl_page_export(env, page, 1);
+		cl_page_disown(env, io, page);
+	} else {
+		cl_page_list_add(&queue->c2_qin, page);
 	}
 
-	cl_2queue_init(queue);
-	/*
-	 * Add page into the queue even when it is marked uptodate above.
-	 * this will unlock it automatically as part of cl_page_list_disown().
-	 */
-	cl_page_list_add(&queue->c2_qin, page);
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		int rc2;
 
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   vpg->vpg_defer_uptodate);
+				   uptodate);
 		CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	}
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
index 9f08f0308a57..38f742496ee2 100644
--- a/drivers/staging/lustre/lustre/osc/osc_io.c
+++ b/drivers/staging/lustre/lustre/osc/osc_io.c
@@ -99,6 +99,7 @@ static int osc_io_read_ahead(const struct lu_env *env,
 			ldlm_lock_decref(&lockh, dlmlock->l_req_mode);
 	}
 
+	ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc;
 	ra->cra_end = cl_index(osc2cl(osc),
 			       dlmlock->l_policy_data.l_extent.end);
 	ra->cra_release = osc_read_ahead_release;
@@ -138,7 +139,7 @@ static int osc_io_submit(const struct lu_env *env,
 
 	LASSERT(qin->pl_nr > 0);
 
-	CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+	CDEBUG(D_CACHE | D_READA, "%d %d\n", qin->pl_nr, crt);
 
 	osc = cl2osc(ios->cis_obj);
 	cli = osc_cli(osc);
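[Editor's note, not part of the patch] A second minimal sketch, this time of the growth behaviour that ras_increase_window() takes on with this change: the window grows by one optimal RPC worth of pages per step and is re-aligned to the RPC size, instead of growing by the old fixed 1MB RAS_INCREASE_STEP. The per-RPC page count (4096, i.e. 16MB RPCs with 4KB pages), the per-file cap, and the initial window length are assumed values for illustration only.

#include <stdio.h>

int main(void)
{
	unsigned long rpc_size = 4096;            /* assumed pages per optimal RPC */
	unsigned long max_pages_per_file = 16384; /* assumed per-file readahead cap */
	unsigned long window_len = 64;            /* assumed initial request size */
	int step;

	for (step = 1; step <= 4; step++) {
		unsigned long wlen = window_len + rpc_size;

		if (wlen > max_pages_per_file)
			wlen = max_pages_per_file;
		/* re-align the grown window down to a multiple of the RPC size */
		window_len = wlen - (wlen % rpc_size);
		printf("step %d: window_len = %lu pages\n", step, window_len);
	}
	return 0;
}

With these numbers the window grows 4096, 8192, 12288, 16384 pages, so each readahead round can be serviced by whole 16MB RPCs rather than trailing sub-RPC fragments.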