aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorcolyli@suse.de <colyli@suse.de>2017-01-28 08:11:49 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-03-12 00:41:52 -0500
commit6f9c02ab9d0d5aefa15953ad3d55912e3a7abb57 (patch)
treedc7fb7add3434feb18275075b7ad1aa56917e348 /drivers/md
parentde2aa5b3ee76e97dc3cea33cbd3d1bd443fda7a0 (diff)
md linear: fix a race between linear_add() and linear_congested()
commit 03a9e24ef2aaa5f1f9837356aed79c860521407a upstream. Recently I receive a bug report that on Linux v3.0 based kerenl, hot add disk to a md linear device causes kernel crash at linear_congested(). From the crash image analysis, I find in linear_congested(), mddev->raid_disks contains value N, but conf->disks[] only has N-1 pointers available. Then a NULL pointer deference crashes the kernel. There is a race between linear_add() and linear_congested(), RCU stuffs used in these two functions cannot avoid the race. Since Linuv v4.0 RCU code is replaced by introducing mddev_suspend(). After checking the upstream code, it seems linear_congested() is not called in generic_make_request() code patch, so mddev_suspend() cannot provent it from being called. The possible race still exists. Here I explain how the race still exists in current code. For a machine has many CPUs, on one CPU, linear_add() is called to add a hard disk to a md linear device; at the same time on other CPU, linear_congested() is called to detect whether this md linear device is congested before issuing an I/O request onto it. Now I use a possible code execution time sequence to demo how the possible race happens, seq linear_add() linear_congested() 0 conf=mddev->private 1 oldconf=mddev->private 2 mddev->raid_disks++ 3 for (i=0; i<mddev->raid_disks;i++) 4 bdev_get_queue(conf->disks[i].rdev->bdev) 5 mddev->private=newconf In linear_add() mddev->raid_disks is increased in time seq 2, and on another CPU in linear_congested() the for-loop iterates conf->disks[i] by the increased mddev->raid_disks in time seq 3,4. But conf with one more element (which is a pointer to struct dev_info type) to conf->disks[] is not updated yet, accessing its structure member in time seq 4 will cause a NULL pointer deference fault. To fix this race, there are 2 parts of modification in the patch, 1) Add 'int raid_disks' in struct linear_conf, as a copy of mddev->raid_disks. It is initialized in linear_conf(), always being consistent with pointers number of 'struct dev_info disks[]'. When iterating conf->disks[] in linear_congested(), use conf->raid_disks to replace mddev->raid_disks in the for-loop, then NULL pointer deference will not happen again. 2) RCU stuffs are back again, and use kfree_rcu() in linear_add() to free oldconf memory. Because oldconf may be referenced as mddev->private in linear_congested(), kfree_rcu() makes sure that its memory will not be released until no one uses it any more. Also some code comments are added in this patch, to make this modification to be easier understandable. This patch can be applied for kernels since v4.0 after commit: 3be260cc18f8 ("md/linear: remove rcu protections in favour of suspend/resume"). But this bug is reported on Linux v3.0 based kernel, for people who maintain kernels before Linux v4.0, they need to do some back back port to this patch. Changelog: - V3: add 'int raid_disks' in struct linear_conf, and use kfree_rcu() to replace rcu_call() in linear_add(). - v2: add RCU stuffs by suggestion from Shaohua and Neil. - v1: initial effort. Signed-off-by: Coly Li <colyli@suse.de> Cc: Shaohua Li <shli@fb.com> Cc: Neil Brown <neilb@suse.com> Signed-off-by: Shaohua Li <shli@fb.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/linear.c39
-rw-r--r--drivers/md/linear.h1
2 files changed, 35 insertions, 5 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 86f5d435901d..b0c0aef92a37 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,18 +52,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
52 return conf->disks + lo; 52 return conf->disks + lo;
53} 53}
54 54
55/*
56 * In linear_congested() conf->raid_disks is used as a copy of
57 * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
58 * and conf->disks[] are created in linear_conf(), they are always
59 * consitent with each other, but mddev->raid_disks does not.
60 */
55static int linear_congested(struct mddev *mddev, int bits) 61static int linear_congested(struct mddev *mddev, int bits)
56{ 62{
57 struct linear_conf *conf; 63 struct linear_conf *conf;
58 int i, ret = 0; 64 int i, ret = 0;
59 65
60 conf = mddev->private; 66 rcu_read_lock();
67 conf = rcu_dereference(mddev->private);
61 68
62 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 69 for (i = 0; i < conf->raid_disks && !ret ; i++) {
63 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); 70 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
64 ret |= bdi_congested(&q->backing_dev_info, bits); 71 ret |= bdi_congested(&q->backing_dev_info, bits);
65 } 72 }
66 73
74 rcu_read_unlock();
67 return ret; 75 return ret;
68} 76}
69 77
@@ -143,6 +151,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
143 conf->disks[i-1].end_sector + 151 conf->disks[i-1].end_sector +
144 conf->disks[i].rdev->sectors; 152 conf->disks[i].rdev->sectors;
145 153
154 /*
155 * conf->raid_disks is copy of mddev->raid_disks. The reason to
156 * keep a copy of mddev->raid_disks in struct linear_conf is,
157 * mddev->raid_disks may not be consistent with pointers number of
158 * conf->disks[] when it is updated in linear_add() and used to
159 * iterate old conf->disks[] earray in linear_congested().
160 * Here conf->raid_disks is always consitent with number of
161 * pointers in conf->disks[] array, and mddev->private is updated
162 * with rcu_assign_pointer() in linear_addr(), such race can be
163 * avoided.
164 */
165 conf->raid_disks = raid_disks;
166
146 return conf; 167 return conf;
147 168
148out: 169out:
@@ -195,15 +216,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
195 if (!newconf) 216 if (!newconf)
196 return -ENOMEM; 217 return -ENOMEM;
197 218
219 /* newconf->raid_disks already keeps a copy of * the increased
220 * value of mddev->raid_disks, WARN_ONCE() is just used to make
221 * sure of this. It is possible that oldconf is still referenced
222 * in linear_congested(), therefore kfree_rcu() is used to free
223 * oldconf until no one uses it anymore.
224 */
198 mddev_suspend(mddev); 225 mddev_suspend(mddev);
199 oldconf = mddev->private; 226 oldconf = rcu_dereference(mddev->private);
200 mddev->raid_disks++; 227 mddev->raid_disks++;
201 mddev->private = newconf; 228 WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
229 "copied raid_disks doesn't match mddev->raid_disks");
230 rcu_assign_pointer(mddev->private, newconf);
202 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 231 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
203 set_capacity(mddev->gendisk, mddev->array_sectors); 232 set_capacity(mddev->gendisk, mddev->array_sectors);
204 mddev_resume(mddev); 233 mddev_resume(mddev);
205 revalidate_disk(mddev->gendisk); 234 revalidate_disk(mddev->gendisk);
206 kfree(oldconf); 235 kfree_rcu(oldconf, rcu);
207 return 0; 236 return 0;
208} 237}
209 238
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f7..8d392e6098b3 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,7 @@ struct linear_conf
10{ 10{
11 struct rcu_head rcu; 11 struct rcu_head rcu;
12 sector_t array_sectors; 12 sector_t array_sectors;
13 int raid_disks; /* a copy of mddev->raid_disks */
13 struct dev_info disks[0]; 14 struct dev_info disks[0];
14}; 15};
15#endif 16#endif