
Commit ac325fc

josefbacik authored and kdave committed
btrfs: do not hold the extent lock for entire read
Historically we've held the extent lock throughout the entire read. There have been a few reasons for this, but mostly it has just caused us problems. For example, it prevents us from allowing page faults during direct io reads, because we could deadlock. This has forced us to only allow 4k reads at a time for io_uring NOWAIT requests, because we have no idea if we'll be forced to page fault and thus have to do a whole lot of work.

On the buffered side we are protected by the page lock: as long as we're reading, things like buffered writes, punch hole, and even direct IO (to a certain degree) will get hung up on the page lock while the page is in flight.

On the direct side we have the dio extent lock, which acts much like the extent lock did prior to this patch, but only for direct reads. This protects direct reads from concurrent direct writes, while we're protected from buffered writes via the inode lock.

Now that we're protected in all cases, narrow the extent lock to the part where we're getting the extent map to submit the reads, no longer holding the extent lock for the entire read operation. Push the extent lock down into do_readpage() so that we're only grabbing it when looking up the extent map. This portion was contributed by Goldwyn.

Co-developed-by: Goldwyn Rodrigues <[email protected]>
Reviewed-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Josef Bacik <[email protected]>
Signed-off-by: David Sterba <[email protected]>
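The heart of the change is visible in __get_extent_map() in fs/btrfs/extent_io.c below: the extent lock is now taken only around the extent map lookup rather than across the whole read. A minimal sketch of the narrowed critical section, condensed from the patch (the em_cached fast path and error handling are omitted):

static struct extent_map *__get_extent_map(struct inode *inode,
                                           struct folio *folio, u64 start,
                                           u64 len,
                                           struct extent_map **em_cached)
{
        struct extent_state *cached_state = NULL;
        struct extent_map *em;

        /* Wait out ordered extents and lock just the range being mapped. */
        btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start,
                                           start + len - 1, &cached_state);
        em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);
        /* Drop the extent lock as soon as the extent map is in hand. */
        unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
                      &cached_state);
        return em;
}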
1 parent 07d399c commit ac325fc

File tree

3 files changed (+29, -116 lines)

fs/btrfs/compression.c

Lines changed: 1 addition & 1 deletion
@@ -521,6 +521,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		}
 		add_size = min(em->start + em->len, page_end + 1) - cur;
 		free_extent_map(em);
+		unlock_extent(tree, cur, page_end, NULL);
 
 		if (folio->index == end_index) {
 			size_t zero_offset = offset_in_folio(folio, isize);
@@ -534,7 +535,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 		if (!bio_add_folio(orig_bio, folio, add_size,
 				   offset_in_folio(folio, cur))) {
-			unlock_extent(tree, cur, page_end, NULL);
 			folio_unlock(folio);
 			folio_put(folio);
 			break;

fs/btrfs/direct-io.c

Lines changed: 24 additions & 25 deletions
@@ -365,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	int ret = 0;
 	u64 len = length;
 	const u64 data_alloc_len = length;
-	bool unlock_extents = false;
+	u32 unlock_bits = EXTENT_LOCKED;
 
 	/*
 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
@@ -526,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 						    start, &len, flags);
 		if (ret < 0)
 			goto unlock_err;
-		unlock_extents = true;
 		/* Recalc len in case the new em is smaller than requested */
 		len = min(len, em->len - (start - em->start));
 		if (dio_data->data_space_reserved) {
@@ -547,23 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 						       release_offset,
 						       release_len);
 		}
-	} else {
-		/*
-		 * We need to unlock only the end area that we aren't using.
-		 * The rest is going to be unlocked by the endio routine.
-		 */
-		lockstart = start + len;
-		if (lockstart < lockend)
-			unlock_extents = true;
 	}
 
-	if (unlock_extents)
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED,
-				 &cached_state);
-	else
-		free_extent_state(cached_state);
-
 	/*
 	 * Translate extent map information to iomap.
 	 * We trim the extents (and move the addr) even though iomap code does
@@ -582,6 +566,23 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	iomap->length = len;
 	free_extent_map(em);
 
+	/*
+	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
+	 * writes only hold it for this part. We hold the extent lock until
+	 * we're completely done with the extent map to make sure it remains
+	 * valid.
+	 */
+	if (write)
+		unlock_bits |= EXTENT_DIO_LOCKED;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, &cached_state);
+
+	/* We didn't use everything, unlock the dio extent for the remainder. */
+	if (!write && (start + len) < lockend)
+		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+				  lockend, NULL);
+
 	return 0;
 
 unlock_err:
@@ -614,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
 	if (!write && (iomap->type == IOMAP_HOLE)) {
 		/* If reading from a hole, unlock and return */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+				  pos + length - 1, NULL);
 		return 0;
 	}
 
@@ -626,9 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
 						    pos, length, false);
 		else
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, pos,
-					 pos + length - 1,
-					 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+					  pos + length - 1, NULL);
 		ret = -ENOTBLK;
 	}
 	if (write) {
@@ -660,9 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
 					    dip->file_offset, dip->bytes,
 					    !bio->bi_status);
 	} else {
-		clear_extent_bit(&inode->io_tree, dip->file_offset,
-				 dip->file_offset + dip->bytes - 1,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+		unlock_dio_extent(&inode->io_tree, dip->file_offset,
+				  dip->file_offset + dip->bytes - 1, NULL);
 	}
 
 	bbio->bio.bi_private = bbio->private;
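Taken together, these hunks split the unlock policy by I/O direction: a write drops both EXTENT_LOCKED and EXTENT_DIO_LOCKED once the extent map has been translated to the iomap, while a read drops only EXTENT_LOCKED and keeps EXTENT_DIO_LOCKED until btrfs_dio_end_io(), with unlock_dio_extent() covering holes and any unused tail of the range. A condensed sketch of the decision in btrfs_dio_iomap_begin(), restating the hunks above:

        u32 unlock_bits = EXTENT_LOCKED;

        /* Writes are done with the dio lock here; reads hold it until endio. */
        if (write)
                unlock_bits |= EXTENT_DIO_LOCKED;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, &cached_state);

        /* A read that didn't consume the whole range unlocks the tail now. */
        if (!write && (start + len) < lockend)
                unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
                                  lockend, NULL);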

fs/btrfs/extent_io.c

Lines changed: 4 additions & 90 deletions
@@ -480,75 +480,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
 	bio_put(bio);
 }
 
-/*
- * Record previously processed extent range
- *
- * For endio_readpage_release_extent() to handle a full extent range, reducing
- * the extent io operations.
- */
-struct processed_extent {
-	struct btrfs_inode *inode;
-	/* Start of the range in @inode */
-	u64 start;
-	/* End of the range in @inode */
-	u64 end;
-	bool uptodate;
-};
-
-/*
- * Try to release processed extent range
- *
- * May not release the extent range right now if the current range is
- * contiguous to processed extent.
- *
- * Will release processed extent when any of @inode, @uptodate, the range is
- * no longer contiguous to the processed range.
- *
- * Passing @inode == NULL will force processed extent to be released.
- */
-static void endio_readpage_release_extent(struct processed_extent *processed,
-			      struct btrfs_inode *inode, u64 start, u64 end,
-			      bool uptodate)
-{
-	struct extent_state *cached = NULL;
-	struct extent_io_tree *tree;
-
-	/* The first extent, initialize @processed */
-	if (!processed->inode)
-		goto update;
-
-	/*
-	 * Contiguous to processed extent, just uptodate the end.
-	 *
-	 * Several things to notice:
-	 *
-	 * - bio can be merged as long as on-disk bytenr is contiguous
-	 *   This means we can have page belonging to other inodes, thus need to
-	 *   check if the inode still matches.
-	 * - bvec can contain range beyond current page for multi-page bvec
-	 *   Thus we need to do processed->end + 1 >= start check
-	 */
-	if (processed->inode == inode && processed->uptodate == uptodate &&
-	    processed->end + 1 >= start && end >= processed->end) {
-		processed->end = end;
-		return;
-	}
-
-	tree = &processed->inode->io_tree;
-	/*
-	 * Now we don't have range contiguous to the processed range, release
-	 * the processed range now.
-	 */
-	unlock_extent(tree, processed->start, processed->end, &cached);
-
-update:
-	/* Update processed to current range */
-	processed->inode = inode;
-	processed->start = start;
-	processed->end = end;
-	processed->uptodate = uptodate;
-}
-
 static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
 {
 	ASSERT(folio_test_locked(folio));
@@ -575,7 +506,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 {
 	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct bio *bio = &bbio->bio;
-	struct processed_extent processed = { 0 };
 	struct folio_iter fi;
 	const u32 sectorsize = fs_info->sectorsize;
 
@@ -640,11 +570,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 
 		/* Update page status and unlock. */
 		end_folio_read(folio, uptodate, start, len);
-		endio_readpage_release_extent(&processed, BTRFS_I(inode),
-					      start, end, uptodate);
 	}
-	/* Release the last extent */
-	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
 	bio_put(bio);
 }
 
@@ -973,6 +899,7 @@ static struct extent_map *__get_extent_map(struct inode *inode,
 					   u64 len, struct extent_map **em_cached)
 {
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 
 	ASSERT(em_cached);
 
@@ -988,12 +915,15 @@ static struct extent_map *__get_extent_map(struct inode *inode,
 		*em_cached = NULL;
 	}
 
+	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state);
 	em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);
 	if (!IS_ERR(em)) {
 		BUG_ON(*em_cached);
 		refcount_inc(&em->refs);
 		*em_cached = em;
 	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state);
+
 	return em;
 }
 /*
@@ -1019,11 +949,9 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t blocksize = fs_info->sectorsize;
-	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
 	ret = set_folio_extent_mapped(folio);
 	if (ret < 0) {
-		unlock_extent(tree, start, end, NULL);
 		folio_unlock(folio);
 		return ret;
 	}
@@ -1047,14 +975,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		if (cur >= last_byte) {
 			iosize = folio_size(folio) - pg_offset;
 			folio_zero_range(folio, pg_offset, iosize);
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_folio_read(folio, true, cur, iosize);
 			break;
 		}
 		em = __get_extent_map(inode, folio, cur, end - cur + 1,
 				      em_cached);
 		if (IS_ERR(em)) {
-			unlock_extent(tree, cur, end, NULL);
 			end_folio_read(folio, false, cur, end + 1 - cur);
 			return PTR_ERR(em);
 		}
@@ -1123,15 +1049,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		if (block_start == EXTENT_MAP_HOLE) {
 			folio_zero_range(folio, pg_offset, iosize);
 
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_folio_read(folio, true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
 		}
 		/* the get_extent function already copied into the folio */
 		if (block_start == EXTENT_MAP_INLINE) {
-			unlock_extent(tree, cur, cur + iosize - 1, NULL);
 			end_folio_read(folio, true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
@@ -1156,15 +1080,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 
 int btrfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct btrfs_inode *inode = folio_to_inode(folio);
-	u64 start = folio_pos(folio);
-	u64 end = start + folio_size(folio) - 1;
 	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
 	struct extent_map *em_cached = NULL;
 	int ret;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
 	ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
 	free_extent_map(em_cached);
 
@@ -2337,15 +2256,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 void btrfs_readahead(struct readahead_control *rac)
 {
 	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
-	struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
 	struct folio *folio;
-	u64 start = readahead_pos(rac);
-	u64 end = start + readahead_length(rac) - 1;
 	struct extent_map *em_cached = NULL;
 	u64 prev_em_start = (u64)-1;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
 	while ((folio = readahead_folio(rac)) != NULL)
 		btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
 