aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTang Junhui <tang.junhui@zte.com.cn>2018-02-07 14:41:40 -0500
committerJens Axboe <axboe@kernel.dk>2018-02-07 14:50:01 -0500
commitc4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0 (patch)
tree574e8a7549cc73433eeef79fcdabb4d3c2083e84
parenta728eacbbdd229d1d903e46261c57d5206f87a4a (diff)
bcache: fix high CPU occupancy during journal
After running small-write I/O for a long time, we found the occupancy of the CPU was very high and I/O performance had been reduced by about half: [root@ceph151 internal]# top top - 15:51:05 up 1 day,2:43, 4 users, load average: 16.89, 15.15, 16.53 Tasks: 2063 total, 4 running, 2059 sleeping, 0 stopped, 0 zombie %Cpu(s):4.3 us, 17.1 sy 0.0 ni, 66.1 id, 12.0 wa, 0.0 hi, 0.5 si, 0.0 st KiB Mem : 65450044 total, 24586420 free, 38909008 used, 1954616 buff/cache KiB Swap: 65667068 total, 65667068 free, 0 used. 25136812 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2023 root 20 0 0 0 0 S 55.1 0.0 0:04.42 kworker/11:191 14126 root 20 0 0 0 0 S 42.9 0.0 0:08.72 kworker/10:3 9292 root 20 0 0 0 0 S 30.4 0.0 1:10.99 kworker/6:1 8553 ceph 20 0 4242492 1.805g 18804 S 30.0 2.9 410:07.04 ceph-osd 12287 root 20 0 0 0 0 S 26.7 0.0 0:28.13 kworker/7:85 31019 root 20 0 0 0 0 S 26.1 0.0 1:30.79 kworker/22:1 1787 root 20 0 0 0 0 R 25.7 0.0 5:18.45 kworker/8:7 32169 root 20 0 0 0 0 S 14.5 0.0 1:01.92 kworker/23:1 21476 root 20 0 0 0 0 S 13.9 0.0 0:05.09 kworker/1:54 2204 root 20 0 0 0 0 S 12.5 0.0 1:25.17 kworker/9:10 16994 root 20 0 0 0 0 S 12.2 0.0 0:06.27 kworker/5:106 15714 root 20 0 0 0 0 R 10.9 0.0 0:01.85 kworker/19:2 9661 ceph 20 0 4246876 1.731g 18800 S 10.6 2.8 403:00.80 ceph-osd 11460 ceph 20 0 4164692 2.206g 18876 S 10.6 3.5 360:27.19 ceph-osd 9960 root 20 0 0 0 0 S 10.2 0.0 0:02.75 kworker/2:139 11699 ceph 20 0 4169244 1.920g 18920 S 10.2 3.1 355:23.67 ceph-osd 6843 ceph 20 0 4197632 1.810g 18900 S 9.6 2.9 380:08.30 ceph-osd The kernel workers consumed a lot of CPU, and I found they were running journal work: the journal was reclaiming resources and flushing btree nodes with surprising frequency.
Through further analysis, we found that in btree_flush_write() we try to get the btree node with the smallest fifo index to flush by traversing all the btree nodes in c->bucket_hash. After we get it, since no lock protects it, this btree node may already have been written to the cache device by other workers; if this occurs, we retry the traversal of c->bucket_hash to get another btree node. When the problem occurred, the retry count was very high, and we consumed a lot of CPU looking for an appropriate btree node. In this patch, we record the 128 btree nodes with the smallest fifo indexes in a heap, and pop them one by one when we need to flush a btree node. This greatly reduces the time the loop spends finding an appropriate btree node, and also reduces the CPU occupancy. [note by mpl: this triggers a checkpatch error because of adjacent, pre-existing style violations] Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--drivers/md/bcache/bcache.h2
-rw-r--r--drivers/md/bcache/journal.c47
-rw-r--r--drivers/md/bcache/util.h2
3 files changed, 36 insertions, 15 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b98d7705acb6..a857eb3c10de 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -679,6 +679,8 @@ struct cache_set {
679 679
680#define BUCKET_HASH_BITS 12 680#define BUCKET_HASH_BITS 12
681 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; 681 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
682
683 DECLARE_HEAP(struct btree *, flush_btree);
682}; 684};
683 685
684struct bbio { 686struct bbio {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index f5296007a9d5..1b736b860739 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ err:
368} 368}
369 369
370/* Journalling */ 370/* Journalling */
371#define journal_max_cmp(l, r) \
372 (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
373 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
374#define journal_min_cmp(l, r) \
375 (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
376 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
371 377
372static void btree_flush_write(struct cache_set *c) 378static void btree_flush_write(struct cache_set *c)
373{ 379{
@@ -375,25 +381,35 @@ static void btree_flush_write(struct cache_set *c)
375 * Try to find the btree node with that references the oldest journal 381 * Try to find the btree node with that references the oldest journal
376 * entry, best is our current candidate and is locked if non NULL: 382 * entry, best is our current candidate and is locked if non NULL:
377 */ 383 */
378 struct btree *b, *best; 384 struct btree *b;
379 unsigned i; 385 int i;
380 386
381 atomic_long_inc(&c->flush_write); 387 atomic_long_inc(&c->flush_write);
388
382retry: 389retry:
383 best = NULL; 390 spin_lock(&c->journal.lock);
384 391 if (heap_empty(&c->flush_btree)) {
385 for_each_cached_btree(b, c, i) 392 for_each_cached_btree(b, c, i)
386 if (btree_current_write(b)->journal) { 393 if (btree_current_write(b)->journal) {
387 if (!best) 394 if (!heap_full(&c->flush_btree))
388 best = b; 395 heap_add(&c->flush_btree, b,
389 else if (journal_pin_cmp(c, 396 journal_max_cmp);
390 btree_current_write(best)->journal, 397 else if (journal_max_cmp(b,
391 btree_current_write(b)->journal)) { 398 heap_peek(&c->flush_btree))) {
392 best = b; 399 c->flush_btree.data[0] = b;
400 heap_sift(&c->flush_btree, 0,
401 journal_max_cmp);
402 }
393 } 403 }
394 }
395 404
396 b = best; 405 for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
406 heap_sift(&c->flush_btree, i, journal_min_cmp);
407 }
408
409 b = NULL;
410 heap_pop(&c->flush_btree, b, journal_min_cmp);
411 spin_unlock(&c->journal.lock);
412
397 if (b) { 413 if (b) {
398 mutex_lock(&b->write_lock); 414 mutex_lock(&b->write_lock);
399 if (!btree_current_write(b)->journal) { 415 if (!btree_current_write(b)->journal) {
@@ -824,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
824 j->w[0].c = c; 840 j->w[0].c = c;
825 j->w[1].c = c; 841 j->w[1].c = c;
826 842
827 if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || 843 if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
844 !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
828 !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || 845 !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
829 !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) 846 !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
830 return -ENOMEM; 847 return -ENOMEM;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4df4c5c1cab2..a6763db7f061 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do { \
112 112
113#define heap_full(h) ((h)->used == (h)->size) 113#define heap_full(h) ((h)->used == (h)->size)
114 114
115#define heap_empty(h) ((h)->used == 0)
116
115#define DECLARE_FIFO(type, name) \ 117#define DECLARE_FIFO(type, name) \
116 struct { \ 118 struct { \
117 size_t front, back, size, mask; \ 119 size_t front, back, size, mask; \