Skip to content

Commit 681583a

Browse files
YangYang866gregkh
authored andcommitted
sbitmap: fix io hung due to race on sbitmap_word::cleared
[ Upstream commit 72d04bd ] Configuration for sbq: depth=64, wake_batch=6, shift=6, map_nr=1 1. There are 64 requests in progress: map->word = 0xFFFFFFFFFFFFFFFF 2. After all the 64 requests complete, and no more requests come: map->word = 0xFFFFFFFFFFFFFFFF, map->cleared = 0xFFFFFFFFFFFFFFFF 3. Now two tasks try to allocate requests: T1: T2: __blk_mq_get_tag . __sbitmap_queue_get . sbitmap_get . sbitmap_find_bit . sbitmap_find_bit_in_word . __sbitmap_get_word -> nr=-1 __blk_mq_get_tag sbitmap_deferred_clear __sbitmap_queue_get /* map->cleared=0xFFFFFFFFFFFFFFFF */ sbitmap_find_bit if (!READ_ONCE(map->cleared)) sbitmap_find_bit_in_word return false; __sbitmap_get_word -> nr=-1 mask = xchg(&map->cleared, 0) sbitmap_deferred_clear atomic_long_andnot() /* map->cleared=0 */ if (!(map->cleared)) return false; /* * map->cleared is cleared by T1 * T2 fail to acquire the tag */ 4. T2 is the sole tag waiter. When T1 puts the tag, T2 cannot be woken up due to the wake_batch being set at 6. If no more requests come, T1 will wait here indefinitely. This patch achieves two purposes: 1. Check on ->cleared and update on both ->cleared and ->word need to be done atomically, and using spinlock could be the simplest solution. 2. Add extra check in sbitmap_deferred_clear(), to identify whether ->word has free bits. Fixes: ea86ea2 ("sbitmap: ammortize cost of clearing bits") Signed-off-by: Yang Yang <[email protected]> Reviewed-by: Ming Lei <[email protected]> Reviewed-by: Bart Van Assche <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jens Axboe <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent aba6f11 commit 681583a

File tree

2 files changed

+34
-7
lines changed

2 files changed

+34
-7
lines changed

include/linux/sbitmap.h

+5
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ struct sbitmap_word {
3636
* @cleared: word holding cleared bits
3737
*/
3838
unsigned long cleared ____cacheline_aligned_in_smp;
39+
40+
/**
41+
* @swap_lock: serializes simultaneous updates of ->word and ->cleared
42+
*/
43+
spinlock_t swap_lock;
3944
} ____cacheline_aligned_in_smp;
4045

4146
/**

lib/sbitmap.c

+29-7
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,30 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
6060
/*
6161
* See if we have deferred clears that we can batch move
6262
*/
63-
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
63+
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
64+
unsigned int depth, unsigned int alloc_hint, bool wrap)
6465
{
65-
unsigned long mask;
66+
unsigned long mask, word_mask;
6667

67-
if (!READ_ONCE(map->cleared))
68-
return false;
68+
guard(spinlock_irqsave)(&map->swap_lock);
69+
70+
if (!map->cleared) {
71+
if (depth == 0)
72+
return false;
73+
74+
word_mask = (~0UL) >> (BITS_PER_LONG - depth);
75+
/*
76+
* The current behavior is to always retry after moving
77+
* ->cleared to word, and we change it to retry in case
78+
* of any free bits. To avoid an infinite loop, we need
79+
* to take wrap & alloc_hint into account, otherwise a
80+
* soft lockup may occur.
81+
*/
82+
if (!wrap && alloc_hint)
83+
word_mask &= ~((1UL << alloc_hint) - 1);
84+
85+
return (READ_ONCE(map->word) & word_mask) != word_mask;
86+
}
6987

7088
/*
7189
* First get a stable cleared mask, setting the old mask to 0.
@@ -85,6 +103,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
85103
bool alloc_hint)
86104
{
87105
unsigned int bits_per_word;
106+
int i;
88107

89108
if (shift < 0)
90109
shift = sbitmap_calculate_shift(depth);
@@ -116,6 +135,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
116135
return -ENOMEM;
117136
}
118137

138+
for (i = 0; i < sb->map_nr; i++)
139+
spin_lock_init(&sb->map[i].swap_lock);
140+
119141
return 0;
120142
}
121143
EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -126,7 +148,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
126148
unsigned int i;
127149

128150
for (i = 0; i < sb->map_nr; i++)
129-
sbitmap_deferred_clear(&sb->map[i]);
151+
sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
130152

131153
sb->depth = depth;
132154
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
@@ -179,7 +201,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
179201
alloc_hint, wrap);
180202
if (nr != -1)
181203
break;
182-
if (!sbitmap_deferred_clear(map))
204+
if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
183205
break;
184206
} while (1);
185207

@@ -505,7 +527,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
505527
unsigned int map_depth = __map_depth(sb, index);
506528
unsigned long val;
507529

508-
sbitmap_deferred_clear(map);
530+
sbitmap_deferred_clear(map, 0, 0, 0);
509531
val = READ_ONCE(map->word);
510532
if (val == (1UL << (map_depth - 1)) - 1)
511533
goto next;

0 commit comments

Comments
 (0)