diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2011-01-24 08:47:09 -0500 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2011-03-10 05:48:10 -0500 |
commit | 10f6d9926cd17afff9dc03c967706419798b4929 (patch) | |
tree | 7b1cd4bfcffbb3832c756ce29861411f7300aadd /drivers/block | |
parent | 039312b6481e2928f3be19fee94c83327d93e4c7 (diff) |
drbd: don't BUG_ON, if bio_add_page of a single page to an empty bio fails
Deal with it more gracefully if we fail to add even a single page
to an empty bio. We used to BUG_ON() there, but this failure has been observed
in some Xen deployments, so we need to handle that case more robustly now.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 45 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 7 |
2 files changed, 34 insertions, 18 deletions
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a56b107e01eb..9e9fc3413137 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -1073,6 +1073,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) | |||
1073 | * @mdev: DRBD device. | 1073 | * @mdev: DRBD device. |
1074 | * @e: epoch entry | 1074 | * @e: epoch entry |
1075 | * @rw: flag field, see bio->bi_rw | 1075 | * @rw: flag field, see bio->bi_rw |
1076 | * | ||
1077 | * May spread the pages to multiple bios, | ||
1078 | * depending on bio_add_page restrictions. | ||
1079 | * | ||
1080 | * Returns 0 if all bios have been submitted, | ||
1081 | * -ENOMEM if we could not allocate enough bios, | ||
1082 | * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a | ||
1083 | * single page to an empty bio (which should never happen and likely indicates | ||
1084 | * that the lower level IO stack is in some way broken). This has been observed | ||
1085 | * on certain Xen deployments. | ||
1076 | */ | 1086 | */ |
1077 | /* TODO allocate from our own bio_set. */ | 1087 | /* TODO allocate from our own bio_set. */ |
1078 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | 1088 | int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, |
@@ -1085,6 +1095,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, | |||
1085 | unsigned ds = e->size; | 1095 | unsigned ds = e->size; |
1086 | unsigned n_bios = 0; | 1096 | unsigned n_bios = 0; |
1087 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; | 1097 | unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; |
1098 | int err = -ENOMEM; | ||
1088 | 1099 | ||
1089 | /* In most cases, we will only need one bio. But in case the lower | 1100 | /* In most cases, we will only need one bio. But in case the lower |
1090 | * level restrictions happen to be different at this offset on this | 1101 | * level restrictions happen to be different at this offset on this |
@@ -1110,8 +1121,17 @@ next_bio: | |||
1110 | page_chain_for_each(page) { | 1121 | page_chain_for_each(page) { |
1111 | unsigned len = min_t(unsigned, ds, PAGE_SIZE); | 1122 | unsigned len = min_t(unsigned, ds, PAGE_SIZE); |
1112 | if (!bio_add_page(bio, page, len, 0)) { | 1123 | if (!bio_add_page(bio, page, len, 0)) { |
1113 | /* a single page must always be possible! */ | 1124 | /* A single page must always be possible! |
1114 | BUG_ON(bio->bi_vcnt == 0); | 1125 | * But in case it fails anyways, |
1126 | * we deal with it, and complain (below). */ | ||
1127 | if (bio->bi_vcnt == 0) { | ||
1128 | dev_err(DEV, | ||
1129 | "bio_add_page failed for len=%u, " | ||
1130 | "bi_vcnt=0 (bi_sector=%llu)\n", | ||
1131 | len, (unsigned long long)bio->bi_sector); | ||
1132 | err = -ENOSPC; | ||
1133 | goto fail; | ||
1134 | } | ||
1115 | goto next_bio; | 1135 | goto next_bio; |
1116 | } | 1136 | } |
1117 | ds -= len; | 1137 | ds -= len; |
@@ -1137,7 +1157,7 @@ fail: | |||
1137 | bios = bios->bi_next; | 1157 | bios = bios->bi_next; |
1138 | bio_put(bio); | 1158 | bio_put(bio); |
1139 | } | 1159 | } |
1140 | return -ENOMEM; | 1160 | return err; |
1141 | } | 1161 | } |
1142 | 1162 | ||
1143 | static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) | 1163 | static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) |
@@ -1436,9 +1456,8 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si | |||
1436 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) | 1456 | if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) |
1437 | return true; | 1457 | return true; |
1438 | 1458 | ||
1439 | /* drbd_submit_ee currently fails for one reason only: | 1459 | /* don't care for the reason here */ |
1440 | * not being able to allocate enough bios. | 1460 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1441 | * Is dropping the connection going to help? */ | ||
1442 | spin_lock_irq(&mdev->req_lock); | 1461 | spin_lock_irq(&mdev->req_lock); |
1443 | list_del(&e->w.list); | 1462 | list_del(&e->w.list); |
1444 | spin_unlock_irq(&mdev->req_lock); | 1463 | spin_unlock_irq(&mdev->req_lock); |
@@ -1837,9 +1856,8 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
1837 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) | 1856 | if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) |
1838 | return true; | 1857 | return true; |
1839 | 1858 | ||
1840 | /* drbd_submit_ee currently fails for one reason only: | 1859 | /* don't care for the reason here */ |
1841 | * not being able to allocate enough bios. | 1860 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
1842 | * Is dropping the connection going to help? */ | ||
1843 | spin_lock_irq(&mdev->req_lock); | 1861 | spin_lock_irq(&mdev->req_lock); |
1844 | list_del(&e->w.list); | 1862 | list_del(&e->w.list); |
1845 | hlist_del_init(&e->colision); | 1863 | hlist_del_init(&e->colision); |
@@ -1848,9 +1866,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned | |||
1848 | drbd_al_complete_io(mdev, e->sector); | 1866 | drbd_al_complete_io(mdev, e->sector); |
1849 | 1867 | ||
1850 | out_interrupted: | 1868 | out_interrupted: |
1851 | /* yes, the epoch_size now is imbalanced. | 1869 | drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP); |
1852 | * but we drop the connection anyways, so we don't have a chance to | ||
1853 | * receive a barrier... atomic_inc(&mdev->epoch_size); */ | ||
1854 | put_ldev(mdev); | 1870 | put_ldev(mdev); |
1855 | drbd_free_ee(mdev, e); | 1871 | drbd_free_ee(mdev, e); |
1856 | return false; | 1872 | return false; |
@@ -2096,9 +2112,8 @@ submit: | |||
2096 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) | 2112 | if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) |
2097 | return true; | 2113 | return true; |
2098 | 2114 | ||
2099 | /* drbd_submit_ee currently fails for one reason only: | 2115 | /* don't care for the reason here */ |
2100 | * not being able to allocate enough bios. | 2116 | dev_err(DEV, "submit failed, triggering re-connect\n"); |
2101 | * Is dropping the connection going to help? */ | ||
2102 | spin_lock_irq(&mdev->req_lock); | 2117 | spin_lock_irq(&mdev->req_lock); |
2103 | list_del(&e->w.list); | 2118 | list_del(&e->w.list); |
2104 | spin_unlock_irq(&mdev->req_lock); | 2119 | spin_unlock_irq(&mdev->req_lock); |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ff0eb308ee4a..cfd324b9f95b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -369,9 +369,10 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) | |||
369 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) | 369 | if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) |
370 | return 0; | 370 | return 0; |
371 | 371 | ||
372 | /* drbd_submit_ee currently fails for one reason only: | 372 | /* If it failed because of ENOMEM, retry should help. If it failed |
373 | * not being able to allocate enough bios. | 373 | * because bio_add_page failed (probably broken lower level driver), |
374 | * Is dropping the connection going to help? */ | 374 | * retry may or may not help. |
375 | * If it does not, you may need to force disconnect. */ | ||
375 | spin_lock_irq(&mdev->req_lock); | 376 | spin_lock_irq(&mdev->req_lock); |
376 | list_del(&e->w.list); | 377 | list_del(&e->w.list); |
377 | spin_unlock_irq(&mdev->req_lock); | 378 | spin_unlock_irq(&mdev->req_lock); |