
Commit 18bb8bb

morbidrsa authored and kdave committed
btrfs: zoned: automatically reclaim zones
When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable. As this zone_unusable space is behind the current write
pointer it is not possible to use it for new allocations.

In the current implementation a zone is reset once all of the block
group's space is accounted as zone unusable. This behaviour can lead to
premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to
75% by default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the
reclaim process is triggered but after we deleted unused block groups,
which will free space for the relocation process.

Reviewed-by: Filipe Manana <[email protected]>
Signed-off-by: Johannes Thumshirn <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent f337206 commit 18bb8bb
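To put concrete numbers on the description above, here is a standalone sketch (not part of the patch; the 256 MiB block group size is an arbitrary example value). It computes the point at which the default 75% threshold marks a block group for reclaim, mirroring the length * threshold / 100 calculation the patch performs via div_factor_fine().

/*
 * Standalone illustration of the reclaim trigger point described above.
 * The block group length is an example value only; the arithmetic mirrors
 * div_factor_fine(length, threshold), i.e. length * threshold / 100.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t bg_length = 256ULL * 1024 * 1024; /* example: 256 MiB zone */
	const int threshold = 75;                        /* BTRFS_DEFAULT_RECLAIM_THRESH */
	uint64_t trigger = bg_length * threshold / 100;

	printf("reclaim is queued once zone_unusable >= %llu bytes (%llu MiB)\n",
	       (unsigned long long)trigger, (unsigned long long)(trigger >> 20));
	return 0;
}

For this example the block group becomes a reclaim candidate at 192 MiB of zone_unusable space, well before it is entirely unusable.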

File tree

10 files changed: 185 additions, 2 deletions

fs/btrfs/block-group.c

Lines changed: 101 additions & 0 deletions
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
 
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info =
+		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+	struct btrfs_block_group *bg;
+	struct btrfs_space_info *space_info;
+	int ret;
+
+	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+		return;
+
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+		return;
+
+	mutex_lock(&fs_info->reclaim_bgs_lock);
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->reclaim_bgs)) {
+		bg = list_first_entry(&fs_info->reclaim_bgs,
+				      struct btrfs_block_group,
+				      bg_list);
+		list_del_init(&bg->bg_list);
+
+		space_info = bg->space_info;
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+
+		spin_lock(&bg->lock);
+		if (bg->reserved || bg->pinned || bg->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group. We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&bg->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&bg->lock);
+
+		/* Get out fast, in case we're unmounting the filesystem */
+		if (btrfs_fs_closing(fs_info)) {
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+
+		ret = inc_block_group_ro(bg, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0)
+			goto next;
+
+		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+				bg->start, div_u64(bg->used * 100, bg->length));
+		trace_btrfs_reclaim_block_group(bg);
+		ret = btrfs_relocate_chunk(fs_info, bg->start);
+		if (ret)
+			btrfs_err(fs_info, "error relocating chunk %llu",
+				  bg->start);
+
+next:
+		btrfs_put_block_group(bg);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
+	btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+	spin_lock(&fs_info->unused_bgs_lock);
+	if (!list_empty(&fs_info->reclaim_bgs))
+		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	if (list_empty(&bg->bg_list)) {
+		btrfs_get_block_group(bg);
+		trace_btrfs_add_reclaim_block_group(bg);
+		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 			   struct btrfs_path *path)
 {

@@ -3446,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	}
 	spin_unlock(&info->unused_bgs_lock);
 
+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->reclaim_bgs)) {
+		block_group = list_first_entry(&info->reclaim_bgs,
+					       struct btrfs_block_group,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group,

fs/btrfs/block-group.h

Lines changed: 3 additions & 0 deletions
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     u64 group_start, struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 			   u64 type, u64 chunk_offset, u64 size);

fs/btrfs/ctree.h

Lines changed: 5 additions & 0 deletions
@@ -960,6 +960,11 @@ struct btrfs_fs_info {
 	struct work_struct async_data_reclaim_work;
 	struct work_struct preempt_reclaim_work;
 
+	/* Reclaim partially filled block groups in the background */
+	struct work_struct reclaim_bgs_work;
+	struct list_head reclaim_bgs;
+	int bg_reclaim_threshold;
+
 	spinlock_t unused_bgs_lock;
 	struct list_head unused_bgs;
 	struct mutex unused_bg_unpin_mutex;

fs/btrfs/disk-io.c

Lines changed: 13 additions & 0 deletions
@@ -1898,6 +1898,13 @@ static int cleaner_kthread(void *arg)
 		 * unused block groups.
 		 */
 		btrfs_delete_unused_bgs(fs_info);
+
+		/*
+		 * Reclaim block groups in the reclaim_bgs list after we deleted
+		 * all unused block_groups. This possibly gives us some more free
+		 * space.
+		 */
+		btrfs_reclaim_bgs(fs_info);
 sleep:
 		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
 		if (kthread_should_park())

@@ -2886,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_LIST_HEAD(&fs_info->unused_bgs);
+	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
 #ifdef CONFIG_BTRFS_DEBUG
 	INIT_LIST_HEAD(&fs_info->allocated_roots);
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);

@@ -2974,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->swapfile_pins = RB_ROOT;
 
 	fs_info->send_in_progress = 0;
+
+	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
 }
 
 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)

@@ -4332,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
 
+	cancel_work_sync(&fs_info->reclaim_bgs_work);
+
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
 

fs/btrfs/free-space-cache.c

Lines changed: 8 additions & 1 deletion
@@ -11,6 +11,7 @@
 #include <linux/ratelimit.h>
 #include <linux/error-injection.h>
 #include <linux/sched/mm.h>
+#include "misc.h"
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"

@@ -2539,6 +2540,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 					u64 bytenr, u64 size, bool used)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	u64 offset = bytenr - block_group->start;
 	u64 to_free, to_unusable;

@@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	}
 
 	/* All the region is now unusable. Mark it as unused and reclaim */
-	if (block_group->zone_unusable == block_group->length)
+	if (block_group->zone_unusable == block_group->length) {
 		btrfs_mark_bg_unused(block_group);
+	} else if (block_group->zone_unusable >=
+		   div_factor_fine(block_group->length,
+				   fs_info->bg_reclaim_threshold)) {
+		btrfs_mark_bg_to_reclaim(block_group);
+	}
 
 	return 0;
 }
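
The new else-if branch relies on div_factor_fine(), which is why the hunk adds the "misc.h" include. For context, a userspace paraphrase of that helper (an approximation for illustration only, not part of this patch; the in-kernel version uses div_u64() for the division):

#include <stdint.h>

/*
 * Userspace paraphrase of btrfs' div_factor_fine(): scale @num by @factor
 * percent. With bg_reclaim_threshold == 75, the check above reduces to
 * zone_unusable >= block_group->length * 75 / 100.
 */
static inline uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)	/* shortcut: no scaling needed at 100% */
		return num;
	return num * factor / 100;
}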

fs/btrfs/sysfs.c

Lines changed: 35 additions & 0 deletions
@@ -980,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 }
 BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
 
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+					       struct kobj_attribute *a,
+					       char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	ssize_t ret;
+
+	ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+
+	return ret;
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+						struct kobj_attribute *a,
+						const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	int thresh;
+	int ret;
+
+	ret = kstrtoint(buf, 10, &thresh);
+	if (ret)
+		return ret;
+
+	if (thresh <= 50 || thresh > 100)
+		return -EINVAL;
+
+	fs_info->bg_reclaim_threshold = thresh;
+
+	return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+	      btrfs_bg_reclaim_threshold_store);
+
 static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, label),
 	BTRFS_ATTR_PTR(, nodesize),

@@ -991,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, exclusive_operation),
 	BTRFS_ATTR_PTR(, generation),
 	BTRFS_ATTR_PTR(, read_policy),
+	BTRFS_ATTR_PTR(, bg_reclaim_threshold),
 	NULL,
 };
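
Once mounted, the tunable appears in the per-filesystem sysfs directory; the path below assumes the usual /sys/fs/btrfs/<filesystem UUID>/ layout. A minimal userspace sketch (not part of the patch) that reads and optionally updates it:

#include <stdio.h>

/* Read, and optionally set, the bg_reclaim_threshold of one filesystem. */
int main(int argc, char **argv)
{
	char path[512];
	FILE *f;
	int thresh;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <fs-uuid> [new-threshold]\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/sys/fs/btrfs/%s/bg_reclaim_threshold",
		 argv[1]);

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &thresh) == 1)
		printf("current bg_reclaim_threshold: %d%%\n", thresh);
	fclose(f);

	if (argc > 2) {
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		/* The store handler rejects values <= 50 or > 100 with -EINVAL */
		fprintf(f, "%s\n", argv[2]);
		fclose(f);
	}
	return 0;
}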

fs/btrfs/volumes.c

Lines changed: 1 addition & 1 deletion
@@ -3098,7 +3098,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 	return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;

fs/btrfs/volumes.h

Lines changed: 1 addition & 0 deletions
@@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_uuid_scan_kthread(void *data);

fs/btrfs/zoned.h

Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,12 @@
 #include "disk-io.h"
 #include "block-group.h"
 
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+
 struct btrfs_zoned_device_info {
 	/*
	 * Number of zones, zone size and types of zones if bdev is a

include/trace/events/btrfs.h

Lines changed: 12 additions & 0 deletions
@@ -1903,6 +1903,18 @@ DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
 	TP_ARGS(bg_cache)
 );
 
+DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group,
+	TP_PROTO(const struct btrfs_block_group *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group,
+	TP_PROTO(const struct btrfs_block_group *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
 DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
 	TP_PROTO(const struct btrfs_block_group *bg_cache),
 