author    Jinshan Xiong <jinshan.xiong@intel.com>            2017-01-28 19:04:34 -0500
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>    2017-02-03 07:01:37 -0500
commit    198a49a964a0f76f14c26079065fb48c71db48f3
tree      ee63dd968f2d2606cf0be671c69ef4d370127bcf
parent    ea3f00df27159969c061031a21965970f85c1b61
staging: lustre: clio: revise readahead to support 16MB IO
Read ahead currently doesn't handle 16MB RPCs correctly: it assumes
the packets are a default size instead of querying the actual size.
This work adjusts the read ahead policy to issue read ahead RPCs
sized to match the underlying RPC size.
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Signed-off-by: Gu Zheng <gzheng@ddn.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-7990
Reviewed-on: http://review.whamcloud.com/19368
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
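
In outline, the new policy works as in the following minimal standalone
sketch (the helper names and constants here are illustrative, not the
patch's API -- the real equivalents are ras_align() and cra_rpc_size
introduced below):

	#include <stdio.h>
	#include <stdbool.h>

	/* Optimal RPC size in pages, as the OSC layer would report it:
	 * e.g. a 16MB RPC with 4KB pages gives 4096. */
	static unsigned long rpc_size_pages = 4096;

	/* Round @index down to the start of its RPC-sized chunk. */
	static unsigned long align_down(unsigned long index,
					unsigned long *remainder)
	{
		unsigned long rem = index % rpc_size_pages;

		if (remainder)
			*remainder = rem;
		return index - rem;
	}

	/* Trim a read-ahead window [start, end] so that it ends on an
	 * RPC boundary, unless it already reached end of file. */
	static unsigned long trim_window_end(unsigned long end, bool eof)
	{
		unsigned long aligned = align_down(end + 1, NULL);

		if (aligned > 0 && !eof)
			end = aligned - 1;
		return end;
	}

	int main(void)
	{
		/* a window ending mid-RPC is trimmed back to page 4095 */
		printf("%lu\n", trim_window_end(5000, false)); /* 4095 */
		return 0;
	}

With a fixed 1MB assumption, this trimming was wrong on servers doing
16MB RPCs; querying the size per object keeps the window aligned to
whatever the OSC will actually send.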
-rw-r--r--  drivers/staging/lustre/lustre/include/cl_object.h    |   4
-rw-r--r--  drivers/staging/lustre/lustre/ldlm/ldlm_lib.c        |  10
-rw-r--r--  drivers/staging/lustre/lustre/llite/llite_internal.h |  14
-rw-r--r--  drivers/staging/lustre/lustre/llite/rw.c             | 195
-rw-r--r--  drivers/staging/lustre/lustre/osc/osc_io.c           |   3
5 files changed, 114 insertions, 112 deletions
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
index 05203c219f83..e4c0c440f01b 100644
--- a/drivers/staging/lustre/lustre/include/cl_object.h
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -1452,8 +1452,10 @@ struct cl_read_ahead {
 	 * cra_end is included.
 	 */
 	pgoff_t cra_end;
+	/* optimal RPC size for this read, by pages */
+	unsigned long cra_rpc_size;
 	/*
-	 * Release routine. If readahead holds resources underneath, this
+	 * Release callback. If readahead holds resources underneath, this
 	 * function should be called to release it.
 	 */
 	void (*cra_release)(const struct lu_env *env, void *cbdata);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
index 675e25ba2a08..95b8c76f70d0 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -351,13 +351,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 	cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
 	atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
 
-	/* This value may be reduced at connect time in
-	 * ptlrpc_connect_interpret() . We initialize it to only
-	 * 1MB until we know what the performance looks like.
-	 * In the future this should likely be increased. LU-1431
-	 */
-	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
-					  LNET_MTU >> PAGE_SHIFT);
+	/*
+	 * Set it to possible maximum size. It may be reduced by ocd_brw_size
+	 * from OFD after connecting.
+	 */
+	cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
 
 	/*
 	 * set cl_chunkbits default value to PAGE_CACHE_SHIFT,
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 2c72177c81b6..501957cac8ad 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -281,10 +281,8 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 	return container_of(inode, struct ll_inode_info, lli_vfs_inode);
 }
 
-/* default to about 40meg of readahead on a given system. That much tied
- * up in 512k readahead requests serviced at 40ms each is about 1GB/s.
- */
-#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_SHIFT))
+/* default to about 64M of readahead on a given system. */
+#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT))
 
 /* default to read-ahead full files smaller than 2MB on the second read */
 #define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT))
@@ -321,6 +319,9 @@ struct ll_ra_info {
 struct ra_io_arg {
 	unsigned long ria_start;  /* start offset of read-ahead*/
 	unsigned long ria_end;    /* end offset of read-ahead*/
+	unsigned long ria_reserved; /* reserved pages for read-ahead */
+	unsigned long ria_end_min;  /* minimum end to cover current read */
+	bool ria_eof;		    /* reach end of file */
 	/* If stride read pattern is detected, ria_stoff means where
 	 * stride read is started. Note: for normal read-ahead, the
 	 * value here is meaningless, and also it will not be accessed
@@ -551,6 +552,11 @@ struct ll_readahead_state {
 	 */
 	unsigned long ras_window_start, ras_window_len;
 	/*
+	 * Optimal RPC size. It decides how many pages will be sent
+	 * for each read-ahead.
+	 */
+	unsigned long ras_rpc_size;
+	/*
 	 * Where next read-ahead should start at. This lies within read-ahead
 	 * window. Read-ahead window is read in pieces rather than at once
 	 * because: 1. lustre limits total number of pages under read-ahead by
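
For orientation, here is a minimal model (an editor's sketch, not patch
code) of how ll_readahead()/ll_read_ahead_pages() below are expected to
drive the three new ra_io_arg fields: ria_reserved is a page budget
consumed as pages are queued, ria_end_min keeps the window from being
trimmed below the current read, and ria_eof stops RPC alignment once
the window hits end of file:

	#include <stdbool.h>

	struct ria_model {
		unsigned long ria_start, ria_end;
		unsigned long ria_reserved; /* reserved pages for read-ahead */
		unsigned long ria_end_min;  /* minimum end to cover current read */
		bool ria_eof;		    /* reach end of file */
	};

	/* One pass: queue pages while budget remains, and return the
	 * last index submitted (the real loop also skips stride gaps). */
	static unsigned long model_read_ahead(struct ria_model *ria)
	{
		unsigned long idx, ra_end = 0;

		for (idx = ria->ria_start;
		     idx <= ria->ria_end && ria->ria_reserved > 0; idx++) {
			/* the real code queues page @idx here */
			ra_end = idx;
			ria->ria_reserved--;
		}
		return ra_end;
	}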
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
index f10e092979fe..18d3ccbd00e2 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -92,25 +92,6 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
 		goto out;
 	}
 
-	/* If the non-strided (ria_pages == 0) readahead window
-	 * (ria_start + ret) has grown across an RPC boundary, then trim
-	 * readahead size by the amount beyond the RPC so it ends on an
-	 * RPC boundary. If the readahead window is already ending on
-	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
-	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
-	 * The (beyond_rpc != 0) check is skipped since the conditional
-	 * branch is more expensive than subtracting zero from the result.
-	 *
-	 * Strided read is left unaligned to avoid small fragments beyond
-	 * the RPC boundary from needing an extra read RPC.
-	 */
-	if (ria->ria_pages == 0) {
-		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
-
-		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
-			ret -= beyond_rpc;
-	}
-
 	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
 		atomic_sub(ret, &ra->ra_cur_pages);
 		ret = 0;
@@ -147,11 +128,12 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
 
 #define RAS_CDEBUG(ras) \
 	CDEBUG(D_READA,						      \
-	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
-	       "csr %lu sf %lu sp %lu sl %lu\n",		      \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \
+	       "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n",	      \
 	       ras->ras_last_readpage, ras->ras_consecutive_requests, \
 	       ras->ras_consecutive_pages, ras->ras_window_start,     \
 	       ras->ras_window_len, ras->ras_next_readahead,	      \
+	       ras->ras_rpc_size,				      \
 	       ras->ras_requests, ras->ras_request_index,	      \
 	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
 	       ras->ras_stride_pages, ras->ras_stride_length)
@@ -261,20 +243,6 @@ out:
 	       ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
 	       ria->ria_pages)
 
-/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
- * know what the actual RPC size is. If this needs to change, it makes more
- * sense to tune the i_blkbits value for the file based on the OSTs it is
- * striped over, rather than having a constant value for all files here.
- */
-
-/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_SHIFT)).
- * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
- * by default, this should be adjusted corresponding with max_read_ahead_mb
- * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
- * up quickly which will affect read performance significantly. See LU-2816
- */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
-
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
 	return ras->ras_consecutive_stride_requests > 1;
@@ -345,6 +313,17 @@ static int ria_page_count(struct ra_io_arg *ria)
 		 length);
 }
 
+static unsigned long ras_align(struct ll_readahead_state *ras,
+			       unsigned long index,
+			       unsigned long *remainder)
+{
+	unsigned long rem = index % ras->ras_rpc_size;
+
+	if (remainder)
+		*remainder = rem;
+	return index - rem;
+}
+
 /*Check whether the index is in the defined ra-window */
 static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 {
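
A quick sanity check of ras_align() behavior (illustrative values; this
assumes the surrounding llite definitions are in scope and a 16MB
optimal RPC with 4KB pages, i.e. ras_rpc_size == 4096):

	#include <assert.h>

	static void ras_align_example(struct ll_readahead_state *ras)
	{
		unsigned long rem;

		ras->ras_rpc_size = 4096;
		assert(ras_align(ras, 4095, &rem) == 0 && rem == 4095);
		assert(ras_align(ras, 4096, &rem) == 4096 && rem == 0);
		assert(ras_align(ras, 5000, &rem) == 4096 && rem == 904);
	}

In other words, an index is rounded down to the start of its RPC-sized
chunk, which is what keeps window starts and ends on RPC boundaries
throughout the rest of the patch.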
@@ -358,42 +337,63 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 		ria->ria_length < ria->ria_pages);
 }
 
-static int ll_read_ahead_pages(const struct lu_env *env,
-			       struct cl_io *io, struct cl_page_list *queue,
-			       struct ra_io_arg *ria,
-			       unsigned long *reserved_pages,
-			       pgoff_t *ra_end)
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct ll_readahead_state *ras,
+		    struct ra_io_arg *ria)
 {
 	struct cl_read_ahead ra = { 0 };
-	int rc, count = 0;
+	unsigned long ra_end = 0;
 	bool stride_ria;
 	pgoff_t page_idx;
+	int rc;
 
 	LASSERT(ria);
 	RIA_DEBUG(ria);
 
 	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
 	for (page_idx = ria->ria_start;
-	     page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
+	     page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (!ra.cra_end || ra.cra_end < page_idx) {
+				unsigned long end;
+
 				cl_read_ahead_release(env, &ra);
 
 				rc = cl_io_read_ahead(env, io, page_idx, &ra);
 				if (rc < 0)
 					break;
 
+				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+				       page_idx, ra.cra_end, ra.cra_rpc_size);
 				LASSERTF(ra.cra_end >= page_idx,
 					 "object: %p, indcies %lu / %lu\n",
 					 io->ci_obj, ra.cra_end, page_idx);
+				/*
+				 * update read ahead RPC size.
+				 * NB: it's racy but doesn't matter
+				 */
+				if (ras->ras_rpc_size > ra.cra_rpc_size &&
+				    ra.cra_rpc_size > 0)
+					ras->ras_rpc_size = ra.cra_rpc_size;
+				/* trim it to align with optimal RPC size */
+				end = ras_align(ras, ria->ria_end + 1, NULL);
+				if (end > 0 && !ria->ria_eof)
+					ria->ria_end = end - 1;
+				if (ria->ria_end < ria->ria_end_min)
+					ria->ria_end = ria->ria_end_min;
+				if (ria->ria_end > ra.cra_end)
+					ria->ria_end = ra.cra_end;
 			}
 
-			/* If the page is inside the read-ahead window*/
+			/* If the page is inside the read-ahead window */
 			rc = ll_read_ahead_page(env, io, queue, page_idx);
-			if (!rc) {
-				(*reserved_pages)--;
-				count++;
-			}
+			if (rc < 0)
+				break;
+
+			ra_end = page_idx;
+			if (!rc)
+				ria->ria_reserved--;
 		} else if (stride_ria) {
 			/* If it is not in the read-ahead window, and it is
 			 * read-ahead mode, then check whether it should skip
@@ -420,8 +420,7 @@ static int ll_read_ahead_pages(const struct lu_env *env,
 	}
 	cl_read_ahead_release(env, &ra);
 
-	*ra_end = page_idx;
-	return count;
+	return ra_end;
 }
 
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
@@ -431,7 +430,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
 	struct cl_attr *attr = vvp_env_thread_attr(env);
-	unsigned long len, mlen = 0, reserved;
+	unsigned long len, mlen = 0;
 	pgoff_t ra_end, start = 0, end = 0;
 	struct inode *inode;
 	struct ra_io_arg *ria = &lti->lti_ria;
@@ -478,29 +477,15 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	    end < vio->vui_ra_start + vio->vui_ra_count - 1)
 		end = vio->vui_ra_start + vio->vui_ra_count - 1;
 
-	if (end != 0) {
-		unsigned long rpc_boundary;
-		/*
-		 * Align RA window to an optimal boundary.
-		 *
-		 * XXX This would be better to align to cl_max_pages_per_rpc
-		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
-		 * be aligned to the RAID stripe size in the future and that
-		 * is more important than the RPC size.
-		 */
-		/* Note: we only trim the RPC, instead of extending the RPC
-		 * to the boundary, so to avoid reading too much pages during
-		 * random reading.
-		 */
-		rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
-		if (rpc_boundary > 0)
-			rpc_boundary--;
-
-		if (rpc_boundary > start)
-			end = rpc_boundary;
+	if (end) {
+		unsigned long end_index;
 
 		/* Truncate RA window to end of file */
-		end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));
+		end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+		if (end_index <= end) {
+			end = end_index;
+			ria->ria_eof = true;
+		}
 
 		ras->ras_next_readahead = max(end, end + 1);
 		RAS_CDEBUG(ras);
@@ -535,28 +520,31 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	/* at least to extend the readahead window to cover current read */
 	if (!hit && vio->vui_ra_valid &&
 	    vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
+		unsigned long remainder;
+
 		/* to the end of current read window. */
 		mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
 		/* trim to RPC boundary */
-		start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
-		mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
+		ras_align(ras, ria->ria_start, &remainder);
+		mlen = min(mlen, ras->ras_rpc_size - remainder);
+		ria->ria_end_min = ria->ria_start + mlen;
 	}
 
-	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
-	if (reserved < len)
+	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
+	if (ria->ria_reserved < len)
 		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
 
 	CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
-	       reserved, len, mlen,
+	       ria->ria_reserved, len, mlen,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);
+	ra_end = ll_read_ahead_pages(env, io, queue, ras, ria);
 
-	if (reserved != 0)
-		ll_ra_count_put(ll_i2sbi(inode), reserved);
+	if (ria->ria_reserved)
+		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
-	if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
+	if (ra_end == end && ra_end == (kms >> PAGE_SHIFT))
 		ll_ra_stats_inc(inode, RA_STAT_EOF);
 
 	/* if we didn't get to the end of the region we reserved from
@@ -568,13 +556,13 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n",
 	       ra_end, end, ria->ria_end, ret);
 
-	if (ra_end != end + 1) {
+	if (ra_end > 0 && ra_end != end) {
 		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
 		spin_lock(&ras->ras_lock);
-		if (ra_end < ras->ras_next_readahead &&
+		if (ra_end <= ras->ras_next_readahead &&
 		    index_in_window(ra_end, ras->ras_window_start, 0,
 				    ras->ras_window_len)) {
-			ras->ras_next_readahead = ra_end;
+			ras->ras_next_readahead = ra_end + 1;
 			RAS_CDEBUG(ras);
 		}
 		spin_unlock(&ras->ras_lock);
@@ -586,7 +574,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
 			  unsigned long index)
 {
-	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+	ras->ras_window_start = ras_align(ras, index, NULL);
 }
 
 /* called with the ras_lock held or from places where it doesn't matter */
@@ -615,6 +603,7 @@ static void ras_stride_reset(struct ll_readahead_state *ras)
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
 	spin_lock_init(&ras->ras_lock);
	ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES;
 	ras_reset(inode, ras, 0);
 	ras->ras_requests = 0;
 }
@@ -719,12 +708,15 @@ static void ras_increase_window(struct inode *inode,
 	 * but current clio architecture does not support retrieve such
 	 * information from lower layer. FIXME later
 	 */
-	if (stride_io_mode(ras))
-		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
-	else
-		ras->ras_window_len = min(ras->ras_window_len +
-					  RAS_INCREASE_STEP(inode),
-					  ra->ra_max_pages_per_file);
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, ras->ras_rpc_size);
+	} else {
+		unsigned long wlen;
+
+		wlen = min(ras->ras_window_len + ras->ras_rpc_size,
+			   ra->ra_max_pages_per_file);
+		ras->ras_window_len = ras_align(ras, wlen, NULL);
+	}
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
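
The effect of the rewritten non-strided branch, as a standalone model
(an editor's sketch; assumes 4KB pages, a 16MB optimal RPC of 4096
pages, and a 40MB per-file cap of 10240 pages):

	#include <stdio.h>

	int main(void)
	{
		unsigned long rpc_size = 4096;	 /* ras_rpc_size */
		unsigned long max_pages = 10240; /* ra_max_pages_per_file */
		unsigned long wlen, window_len = 0;
		int i;

		for (i = 0; i < 4; i++) {
			wlen = window_len + rpc_size;
			if (wlen > max_pages)
				wlen = max_pages;
			/* ras_align(): round down to an RPC multiple */
			window_len = wlen - (wlen % rpc_size);
			printf("step %d: window_len = %lu\n", i, window_len);
		}
		return 0;
	}

This prints 4096, 8192, 8192, 8192: the window grows by one full RPC at
a time and settles on the largest RPC multiple under the cap, so
read-ahead never issues a trailing partial RPC.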
@@ -852,6 +844,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			 * instead of ras_window_start, which is RPC aligned
 			 */
 			ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+			ras->ras_window_start = max(ras->ras_stride_offset,
+						    ras->ras_window_start);
 		} else {
 			if (ras->ras_next_readahead < ras->ras_window_start)
 				ras->ras_next_readahead = ras->ras_window_start;
@@ -881,7 +875,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		 */
 		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
 		ras->ras_stride_offset = index;
-		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		ras->ras_window_start = max(index, ras->ras_window_start);
 	}
 
 	/* The initial ras_window_len is set to the request size. To avoid
@@ -1098,38 +1092,39 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	struct cl_2queue *queue = &io->ci_queue;
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
 	struct vvp_page *vpg;
+	bool uptodate;
 	int rc = 0;
 
 	vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+	uptodate = vpg->vpg_defer_uptodate;
+
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		struct vvp_io *vio = vvp_env_io(env);
 		enum ras_update_flags flags = 0;
 
-		if (vpg->vpg_defer_uptodate)
+		if (uptodate)
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
 		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
 	}
 
-	if (vpg->vpg_defer_uptodate) {
+	cl_2queue_init(queue);
+	if (uptodate) {
 		vpg->vpg_ra_used = 1;
 		cl_page_export(env, page, 1);
+		cl_page_disown(env, io, page);
+	} else {
+		cl_page_list_add(&queue->c2_qin, page);
 	}
 
-	cl_2queue_init(queue);
-	/*
-	 * Add page into the queue even when it is marked uptodate above.
-	 * this will unlock it automatically as part of cl_page_list_disown().
-	 */
-	cl_page_list_add(&queue->c2_qin, page);
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		int rc2;
 
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   vpg->vpg_defer_uptodate);
+				   uptodate);
 		CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	}
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
index 9f08f0308a57..38f742496ee2 100644
--- a/drivers/staging/lustre/lustre/osc/osc_io.c
+++ b/drivers/staging/lustre/lustre/osc/osc_io.c
@@ -99,6 +99,7 @@ static int osc_io_read_ahead(const struct lu_env *env,
 		ldlm_lock_decref(&lockh, dlmlock->l_req_mode);
 	}
 
+	ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc;
 	ra->cra_end = cl_index(osc2cl(osc),
 			       dlmlock->l_policy_data.l_extent.end);
 	ra->cra_release = osc_read_ahead_release;
@@ -138,7 +139,7 @@ static int osc_io_submit(const struct lu_env *env,
 
 	LASSERT(qin->pl_nr > 0);
 
-	CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+	CDEBUG(D_CACHE | D_READA, "%d %d\n", qin->pl_nr, crt);
 
 	osc = cl2osc(ios->cis_obj);
 	cli = osc_cli(osc);