Skip to content

Commit 95af587

Browse files
author
NeilBrown
committed
md/raid10: ensure device failure recorded before write request returns.
When a write to one of the legs of a RAID10 fails, the failure is recorded in the metadata of the other legs so that after a restart the data on the failed drive won't be trusted even if that drive seems to be working again (maybe a cable was unplugged). Currently there is no interlock between the write request completing and the metadata update. So it is possible that the write will complete, the app will confirm success in some way, and then the machine will crash before the metadata update completes. This is an extremely small hole for a race to fit in, but it is theoretically possible and so should be closed. So: - set MD_CHANGE_PENDING when requesting a metadata update for a failed device, so we can know with certainty when it completes - queue requests that experienced an error on a new queue which is only processed after the metadata update completes - call raid_end_bio_io() on bios in that queue when the time comes. Signed-off-by: NeilBrown <[email protected]>
1 parent 55ce74d commit 95af587

File tree

2 files changed

+34
-1
lines changed

2 files changed

+34
-1
lines changed

drivers/md/raid10.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1681,6 +1681,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
16811681
set_bit(Blocked, &rdev->flags);
16821682
set_bit(Faulty, &rdev->flags);
16831683
set_bit(MD_CHANGE_DEVS, &mddev->flags);
1684+
set_bit(MD_CHANGE_PENDING, &mddev->flags);
16841685
spin_unlock_irqrestore(&conf->device_lock, flags);
16851686
printk(KERN_ALERT
16861687
"md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -2738,6 +2739,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
27382739
}
27392740
put_buf(r10_bio);
27402741
} else {
2742+
bool fail = false;
27412743
for (m = 0; m < conf->copies; m++) {
27422744
int dev = r10_bio->devs[m].devnum;
27432745
struct bio *bio = r10_bio->devs[m].bio;
@@ -2750,6 +2752,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
27502752
rdev_dec_pending(rdev, conf->mddev);
27512753
} else if (bio != NULL &&
27522754
!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2755+
fail = true;
27532756
if (!narrow_write_error(r10_bio, m)) {
27542757
md_error(conf->mddev, rdev);
27552758
set_bit(R10BIO_Degraded,
@@ -2770,7 +2773,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
27702773
if (test_bit(R10BIO_WriteError,
27712774
&r10_bio->state))
27722775
close_write(r10_bio);
2773-
raid_end_bio_io(r10_bio);
2776+
if (fail) {
2777+
spin_lock_irq(&conf->device_lock);
2778+
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2779+
spin_unlock_irq(&conf->device_lock);
2780+
md_wakeup_thread(conf->mddev->thread);
2781+
} else
2782+
raid_end_bio_io(r10_bio);
27742783
}
27752784
}
27762785

@@ -2785,6 +2794,23 @@ static void raid10d(struct md_thread *thread)
27852794

27862795
md_check_recovery(mddev);
27872796

2797+
if (!list_empty_careful(&conf->bio_end_io_list) &&
2798+
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2799+
LIST_HEAD(tmp);
2800+
spin_lock_irqsave(&conf->device_lock, flags);
2801+
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2802+
list_add(&tmp, &conf->bio_end_io_list);
2803+
list_del_init(&conf->bio_end_io_list);
2804+
}
2805+
spin_unlock_irqrestore(&conf->device_lock, flags);
2806+
while (!list_empty(&tmp)) {
2807+
r10_bio = list_first_entry(&conf->bio_end_io_list,
2808+
struct r10bio, retry_list);
2809+
list_del(&r10_bio->retry_list);
2810+
raid_end_bio_io(r10_bio);
2811+
}
2812+
}
2813+
27882814
blk_start_plug(&plug);
27892815
for (;;) {
27902816

@@ -3559,6 +3585,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
35593585
conf->reshape_safe = conf->reshape_progress;
35603586
spin_lock_init(&conf->device_lock);
35613587
INIT_LIST_HEAD(&conf->retry_list);
3588+
INIT_LIST_HEAD(&conf->bio_end_io_list);
35623589

35633590
spin_lock_init(&conf->resync_lock);
35643591
init_waitqueue_head(&conf->wait_barrier);

drivers/md/raid10.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ struct r10conf {
5353
sector_t offset_diff;
5454

5555
struct list_head retry_list;
56+
/* A separate list of r10bio which just need raid_end_bio_io called.
57+
* This mustn't happen for writes which had any errors if the superblock
58+
* needs to be written.
59+
*/
60+
struct list_head bio_end_io_list;
61+
5662
/* queue pending writes and submit them on unplug */
5763
struct bio_list pending_bio_list;
5864
int pending_count;

0 commit comments

Comments
 (0)