summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDennis Zhou (Facebook) <dennisszhou@gmail.com>2018-08-31 16:22:43 -0400
committerJens Axboe <axboe@kernel.dk>2018-08-31 16:48:56 -0400
commit59b57717fff8b562825d9d25e0180ad7e8048ca9 (patch)
treed3e3158cd0f5e9ff6340ba0f9d070772429401b2
parent6b06546206868f723f2061d703a3c3c378dcbf4c (diff)
blkcg: delay blkg destruction until after writeback has finished
Currently, blkcg destruction relies on a sequence of events: 1. Destruction starts. blkcg_css_offline() is called and blkgs release their reference to the blkcg. This immediately destroys the cgwbs (writeback). 2. With blkgs giving up their reference, the blkcg ref count should become zero and eventually call blkcg_css_free() which finally frees the blkcg. Jiufei Xue reported that there is a race between blkcg_bio_issue_check() and cgroup_rmdir(). To remedy this, blkg destruction becomes contingent on the completion of all writeback associated with the blkcg. A count of the number of cgwbs is maintained and once that goes to zero, blkg destruction can follow. This should prevent premature blkg destruction related to writeback. The new process for blkcg cleanup is as follows: 1. Destruction starts. blkcg_css_offline() is called which offlines writeback. Blkg destruction is delayed on the cgwb_refcnt count to avoid punting potentially large amounts of outstanding writeback to root while maintaining any ongoing policies. Here, the base cgwb_refcnt is put back. 2. When the cgwb_refcnt becomes zero, blkcg_destroy_blkgs() is called and handles destruction of blkgs. This is where the css reference held by each blkg is released. 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. This finally frees the blkg. It seems in the past blk-throttle didn't do the most understandable things with taking data from a blkg while associating with current. So, the simplification and unification of what blk-throttle is doing caused this. Fixes: 08e18eab0c579 ("block: add bi_blkg to the bio for cgroups") Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Dennis Zhou <dennisszhou@gmail.com> Cc: Jiufei Xue <jiufei.xue@linux.alibaba.com> Cc: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Tejun Heo <tj@kernel.org> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--block/blk-cgroup.c53
-rw-r--r--include/linux/blk-cgroup.h44
-rw-r--r--mm/backing-dev.c5
3 files changed, 94 insertions, 8 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2998e4f095d1..c19f9078da1e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1042,21 +1042,59 @@ static struct cftype blkcg_legacy_files[] = {
1042 { } /* terminate */ 1042 { } /* terminate */
1043}; 1043};
1044 1044
1045/*
1046 * blkcg destruction is a three-stage process.
1047 *
1048 * 1. Destruction starts. The blkcg_css_offline() callback is invoked
1049 * which offlines writeback. Here we tie the next stage of blkg destruction
1050 * to the completion of writeback associated with the blkcg. This lets us
1051 * avoid punting potentially large amounts of outstanding writeback to root
1052 * while maintaining any ongoing policies. The next stage is triggered when
1053 * the nr_cgwbs count goes to zero.
1054 *
1055 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
1056 * and handles the destruction of blkgs. Here the css reference held by
1057 * the blkg is put back eventually allowing blkcg_css_free() to be called.
1058 * This work may occur in cgwb_release_workfn() on the cgwb_release
1059 * workqueue. Any submitted ios that fail to get the blkg ref will be
1060 * punted to the root_blkg.
1061 *
1062 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
1063 * This finally frees the blkcg.
1064 */
1065
1045/** 1066/**
1046 * blkcg_css_offline - cgroup css_offline callback 1067 * blkcg_css_offline - cgroup css_offline callback
1047 * @css: css of interest 1068 * @css: css of interest
1048 * 1069 *
1049 * This function is called when @css is about to go away and responsible 1070 * This function is called when @css is about to go away. Here the cgwbs are
1050 * for shooting down all blkgs associated with @css. blkgs should be 1071 * offlined first and only once writeback associated with the blkcg has
1051 * removed while holding both q and blkcg locks. As blkcg lock is nested 1072 * finished do we start step 2 (see above).
1052 * inside q lock, this function performs reverse double lock dancing.
1053 *
1054 * This is the blkcg counterpart of ioc_release_fn().
1055 */ 1073 */
1056static void blkcg_css_offline(struct cgroup_subsys_state *css) 1074static void blkcg_css_offline(struct cgroup_subsys_state *css)
1057{ 1075{
1058 struct blkcg *blkcg = css_to_blkcg(css); 1076 struct blkcg *blkcg = css_to_blkcg(css);
1059 1077
1078 /* this prevents anyone from attaching or migrating to this blkcg */
1079 wb_blkcg_offline(blkcg);
1080
1081 /* put the base cgwb reference allowing step 2 to be triggered */
1082 blkcg_cgwb_put(blkcg);
1083}
1084
1085/**
1086 * blkcg_destroy_blkgs - responsible for shooting down blkgs
1087 * @blkcg: blkcg of interest
1088 *
1089 * blkgs should be removed while holding both q and blkcg locks. As blkcg lock
1090 * is nested inside q lock, this function performs reverse double lock dancing.
1091 * Destroying the blkgs releases the reference held on the blkcg's css allowing
1092 * blkcg_css_free to eventually be called.
1093 *
1094 * This is the blkcg counterpart of ioc_release_fn().
1095 */
1096void blkcg_destroy_blkgs(struct blkcg *blkcg)
1097{
1060 spin_lock_irq(&blkcg->lock); 1098 spin_lock_irq(&blkcg->lock);
1061 1099
1062 while (!hlist_empty(&blkcg->blkg_list)) { 1100 while (!hlist_empty(&blkcg->blkg_list)) {
@@ -1075,8 +1113,6 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
1075 } 1113 }
1076 1114
1077 spin_unlock_irq(&blkcg->lock); 1115 spin_unlock_irq(&blkcg->lock);
1078
1079 wb_blkcg_offline(blkcg);
1080} 1116}
1081 1117
1082static void blkcg_css_free(struct cgroup_subsys_state *css) 1118static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -1146,6 +1182,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
1146 INIT_HLIST_HEAD(&blkcg->blkg_list); 1182 INIT_HLIST_HEAD(&blkcg->blkg_list);
1147#ifdef CONFIG_CGROUP_WRITEBACK 1183#ifdef CONFIG_CGROUP_WRITEBACK
1148 INIT_LIST_HEAD(&blkcg->cgwb_list); 1184 INIT_LIST_HEAD(&blkcg->cgwb_list);
1185 refcount_set(&blkcg->cgwb_refcnt, 1);
1149#endif 1186#endif
1150 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); 1187 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1151 1188
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1615cdd4c797..6d766a19f2bb 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -56,6 +56,7 @@ struct blkcg {
56 struct list_head all_blkcgs_node; 56 struct list_head all_blkcgs_node;
57#ifdef CONFIG_CGROUP_WRITEBACK 57#ifdef CONFIG_CGROUP_WRITEBACK
58 struct list_head cgwb_list; 58 struct list_head cgwb_list;
59 refcount_t cgwb_refcnt;
59#endif 60#endif
60}; 61};
61 62
@@ -386,6 +387,49 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
386 return cpd ? cpd->blkcg : NULL; 387 return cpd ? cpd->blkcg : NULL;
387} 388}
388 389
390extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
391
392#ifdef CONFIG_CGROUP_WRITEBACK
393
394/**
395 * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
396 * @blkcg: blkcg of interest
397 *
398 * This is used to track the number of active wb's related to a blkcg.
399 */
400static inline void blkcg_cgwb_get(struct blkcg *blkcg)
401{
402 refcount_inc(&blkcg->cgwb_refcnt);
403}
404
405/**
406 * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
407 * @blkcg: blkcg of interest
408 *
409 * This is used to track the number of active wb's related to a blkcg.
410 * When this count goes to zero, all active wb has finished so the
411 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
412 * This work may occur in cgwb_release_workfn() on the cgwb_release
413 * workqueue.
414 */
415static inline void blkcg_cgwb_put(struct blkcg *blkcg)
416{
417 if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
418 blkcg_destroy_blkgs(blkcg);
419}
420
421#else
422
423static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
424
425static inline void blkcg_cgwb_put(struct blkcg *blkcg)
426{
427 /* wb isn't being accounted, so trigger destruction right away */
428 blkcg_destroy_blkgs(blkcg);
429}
430
431#endif
432
389/** 433/**
390 * blkg_path - format cgroup path of blkg 434 * blkg_path - format cgroup path of blkg
391 * @blkg: blkg of interest 435 * @blkg: blkg of interest
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f5981e9d6ae2..8a8bb8796c6c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -491,6 +491,7 @@ static void cgwb_release_workfn(struct work_struct *work)
491{ 491{
492 struct bdi_writeback *wb = container_of(work, struct bdi_writeback, 492 struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
493 release_work); 493 release_work);
494 struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
494 495
495 mutex_lock(&wb->bdi->cgwb_release_mutex); 496 mutex_lock(&wb->bdi->cgwb_release_mutex);
496 wb_shutdown(wb); 497 wb_shutdown(wb);
@@ -499,6 +500,9 @@ static void cgwb_release_workfn(struct work_struct *work)
499 css_put(wb->blkcg_css); 500 css_put(wb->blkcg_css);
500 mutex_unlock(&wb->bdi->cgwb_release_mutex); 501 mutex_unlock(&wb->bdi->cgwb_release_mutex);
501 502
503 /* triggers blkg destruction if cgwb_refcnt becomes zero */
504 blkcg_cgwb_put(blkcg);
505
502 fprop_local_destroy_percpu(&wb->memcg_completions); 506 fprop_local_destroy_percpu(&wb->memcg_completions);
503 percpu_ref_exit(&wb->refcnt); 507 percpu_ref_exit(&wb->refcnt);
504 wb_exit(wb); 508 wb_exit(wb);
@@ -597,6 +601,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
597 list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); 601 list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
598 list_add(&wb->memcg_node, memcg_cgwb_list); 602 list_add(&wb->memcg_node, memcg_cgwb_list);
599 list_add(&wb->blkcg_node, blkcg_cgwb_list); 603 list_add(&wb->blkcg_node, blkcg_cgwb_list);
604 blkcg_cgwb_get(blkcg);
600 css_get(memcg_css); 605 css_get(memcg_css);
601 css_get(blkcg_css); 606 css_get(blkcg_css);
602 } 607 }