Skip to content

Commit 3a0a529

Browse files
KAGA-KOKO authored and axboe committed
block, scsi: Make SCSI quiesce and resume work reliably
The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. * The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why while the hibernation image is being written or restored that SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests to be submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko <[email protected]> References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). 
Signed-off-by: Bart Van Assche <[email protected]> Reviewed-by: Hannes Reinecke <[email protected]> Tested-by: Martin Steigerwald <[email protected]> Tested-by: Oleksandr Natalenko <[email protected]> Cc: Martin K. Petersen <[email protected]> Cc: Ming Lei <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Johannes Thumshirn <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent c9254f2 commit 3a0a529

File tree

6 files changed

+70
-25
lines changed

6 files changed

+70
-25
lines changed

block/blk-core.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q)
374374

375375
spin_lock_irqsave(q->queue_lock, flags);
376376
queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
377+
wake_up_all(&q->mq_freeze_wq);
377378
spin_unlock_irqrestore(q->queue_lock, flags);
378379
}
379380
EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
@@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
795796
}
796797
EXPORT_SYMBOL(blk_alloc_queue);
797798

798-
int blk_queue_enter(struct request_queue *q, bool nowait)
799+
/**
800+
* blk_queue_enter() - try to increase q->q_usage_counter
801+
* @q: request queue pointer
802+
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
803+
*/
804+
int blk_queue_enter(struct request_queue *q, unsigned int flags)
799805
{
806+
const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
807+
800808
while (true) {
809+
bool success = false;
801810
int ret;
802811

803-
if (percpu_ref_tryget_live(&q->q_usage_counter))
812+
rcu_read_lock_sched();
813+
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
814+
/*
815+
* The code that sets the PREEMPT_ONLY flag is
816+
* responsible for ensuring that that flag is globally
817+
* visible before the queue is unfrozen.
818+
*/
819+
if (preempt || !blk_queue_preempt_only(q)) {
820+
success = true;
821+
} else {
822+
percpu_ref_put(&q->q_usage_counter);
823+
}
824+
}
825+
rcu_read_unlock_sched();
826+
827+
if (success)
804828
return 0;
805829

806-
if (nowait)
830+
if (flags & BLK_MQ_REQ_NOWAIT)
807831
return -EBUSY;
808832

809833
/*
@@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
816840
smp_rmb();
817841

818842
ret = wait_event_interruptible(q->mq_freeze_wq,
819-
!atomic_read(&q->mq_freeze_depth) ||
843+
(atomic_read(&q->mq_freeze_depth) == 0 &&
844+
(preempt || !blk_queue_preempt_only(q))) ||
820845
blk_queue_dying(q));
821846
if (blk_queue_dying(q))
822847
return -ENODEV;
@@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
14451470
/* create ioc upfront */
14461471
create_io_context(gfp_mask, q->node);
14471472

1448-
ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
1449-
(op & REQ_NOWAIT));
1473+
ret = blk_queue_enter(q, flags);
14501474
if (ret)
14511475
return ERR_PTR(ret);
14521476
spin_lock_irq(q->queue_lock);
@@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
22672291
current->bio_list = bio_list_on_stack;
22682292
do {
22692293
struct request_queue *q = bio->bi_disk->queue;
2294+
unsigned int flags = bio->bi_opf & REQ_NOWAIT ?
2295+
BLK_MQ_REQ_NOWAIT : 0;
22702296

2271-
if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
2297+
if (likely(blk_queue_enter(q, flags) == 0)) {
22722298
struct bio_list lower, same;
22732299

22742300
/* Create a fresh bio_list for all subordinate requests */
@@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio)
23272353
if (!generic_make_request_checks(bio))
23282354
return BLK_QC_T_NONE;
23292355

2330-
if (unlikely(blk_queue_enter(q, nowait))) {
2356+
if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
23312357
if (nowait && !blk_queue_dying(q))
23322358
bio->bi_status = BLK_STS_AGAIN;
23332359
else

block/blk-mq.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
389389
struct request *rq;
390390
int ret;
391391

392-
ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
392+
ret = blk_queue_enter(q, flags);
393393
if (ret)
394394
return ERR_PTR(ret);
395395

@@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
428428
if (hctx_idx >= q->nr_hw_queues)
429429
return ERR_PTR(-EIO);
430430

431-
ret = blk_queue_enter(q, true);
431+
ret = blk_queue_enter(q, flags);
432432
if (ret)
433433
return ERR_PTR(ret);
434434

drivers/scsi/scsi_lib.c

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
29472947
int
29482948
scsi_device_quiesce(struct scsi_device *sdev)
29492949
{
2950+
struct request_queue *q = sdev->request_queue;
29502951
int err;
29512952

2953+
/*
2954+
* It is allowed to call scsi_device_quiesce() multiple times from
2955+
* the same context but concurrent scsi_device_quiesce() calls are
2956+
* not allowed.
2957+
*/
2958+
WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
2959+
2960+
blk_set_preempt_only(q);
2961+
2962+
blk_mq_freeze_queue(q);
2963+
/*
2964+
* Ensure that the effect of blk_set_preempt_only() will be visible
2965+
* for percpu_ref_tryget() callers that occur after the queue
2966+
* unfreeze even if the queue was already frozen before this function
2967+
* was called. See also https://lwn.net/Articles/573497/.
2968+
*/
2969+
synchronize_rcu();
2970+
blk_mq_unfreeze_queue(q);
2971+
29522972
mutex_lock(&sdev->state_mutex);
29532973
err = scsi_device_set_state(sdev, SDEV_QUIESCE);
2974+
if (err == 0)
2975+
sdev->quiesced_by = current;
2976+
else
2977+
blk_clear_preempt_only(q);
29542978
mutex_unlock(&sdev->state_mutex);
29552979

2956-
if (err)
2957-
return err;
2958-
2959-
scsi_run_queue(sdev->request_queue);
2960-
while (atomic_read(&sdev->device_busy)) {
2961-
msleep_interruptible(200);
2962-
scsi_run_queue(sdev->request_queue);
2963-
}
2964-
return 0;
2980+
return err;
29652981
}
29662982
EXPORT_SYMBOL(scsi_device_quiesce);
29672983

@@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev)
29812997
* device deleted during suspend)
29822998
*/
29832999
mutex_lock(&sdev->state_mutex);
2984-
if (sdev->sdev_state == SDEV_QUIESCE &&
2985-
scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
2986-
scsi_run_queue(sdev->request_queue);
3000+
WARN_ON_ONCE(!sdev->quiesced_by);
3001+
sdev->quiesced_by = NULL;
3002+
blk_clear_preempt_only(sdev->request_queue);
3003+
if (sdev->sdev_state == SDEV_QUIESCE)
3004+
scsi_device_set_state(sdev, SDEV_RUNNING);
29873005
mutex_unlock(&sdev->state_mutex);
29883006
}
29893007
EXPORT_SYMBOL(scsi_device_resume);

fs/block_dev.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
662662
if (!ops->rw_page || bdev_get_integrity(bdev))
663663
return result;
664664

665-
result = blk_queue_enter(bdev->bd_queue, false);
665+
result = blk_queue_enter(bdev->bd_queue, 0);
666666
if (result)
667667
return result;
668668
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
698698

699699
if (!ops->rw_page || bdev_get_integrity(bdev))
700700
return -EOPNOTSUPP;
701-
result = blk_queue_enter(bdev->bd_queue, false);
701+
result = blk_queue_enter(bdev->bd_queue, 0);
702702
if (result)
703703
return result;
704704

include/linux/blkdev.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
959959
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
960960
struct scsi_ioctl_command __user *);
961961

962-
extern int blk_queue_enter(struct request_queue *q, bool nowait);
962+
extern int blk_queue_enter(struct request_queue *q, unsigned int flags);
963963
extern void blk_queue_exit(struct request_queue *q);
964964
extern void blk_start_queue(struct request_queue *q);
965965
extern void blk_start_queue_async(struct request_queue *q);

include/scsi/scsi_device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ struct scsi_device {
219219
unsigned char access_state;
220220
struct mutex state_mutex;
221221
enum scsi_device_state sdev_state;
222+
struct task_struct *quiesced_by;
222223
unsigned long sdev_data[0];
223224
} __attribute__((aligned(sizeof(unsigned long))));
224225

0 commit comments

Comments (0)