Skip to content

Commit fb3592c

Browse files
yhuang-intel authored and akpm00 committed
migrate_pages: fix deadlock in batched migration
Patch series "migrate_pages: fix deadlock in batched synchronous migration", v2. Two deadlock bugs were reported for the migrate_pages() batching series. Thanks Hugh and Pengfei. Analysis shows that if we have locked some other folios except the one we are migrating, it's not safe in general to wait synchronously, for example, to wait the writeback to complete or wait to lock the buffer head. So 1/3 fixes the deadlock in a simple way, where the batching support for the synchronous migration is disabled. The change is straightforward and easy to be understood. While 3/3 re-introduce the batching for synchronous migration via trying to migrate asynchronously in batch optimistically, then fall back to migrate synchronously one by one for fail-to-migrate folios. Test shows that this can restore the TLB flushing batching performance for synchronous migration effectively. This patch (of 3): Two deadlock bugs were reported for the migrate_pages() batching series. Thanks Hugh and Pengfei! For example, in the following deadlock trace snippet, INFO: task kworker/u4:0:9 blocked for more than 147 seconds. Not tainted 6.2.0-rc4-kvm+ #1314 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u4:0 state:D stack:0 pid:9 ppid:2 flags:0x00004000 Workqueue: loop4 loop_rootcg_workfn Call Trace: <TASK> __schedule+0x43b/0xd00 schedule+0x6a/0xf0 io_schedule+0x4a/0x80 folio_wait_bit_common+0x1b5/0x4e0 ? __pfx_wake_page_function+0x10/0x10 __filemap_get_folio+0x73d/0x770 shmem_get_folio_gfp+0x1fd/0xc80 shmem_write_begin+0x91/0x220 generic_perform_write+0x10e/0x2e0 __generic_file_write_iter+0x17e/0x290 ? generic_write_checks+0x12b/0x1a0 generic_file_write_iter+0x97/0x180 ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20 do_iter_readv_writev+0x13c/0x210 ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20 do_iter_write+0xf6/0x330 vfs_iter_write+0x46/0x70 loop_process_work+0x723/0xfe0 loop_rootcg_workfn+0x28/0x40 process_one_work+0x3cc/0x8d0 worker_thread+0x66/0x630 ? 
__pfx_worker_thread+0x10/0x10 kthread+0x153/0x190 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 </TASK> INFO: task repro:1023 blocked for more than 147 seconds. Not tainted 6.2.0-rc4-kvm+ #1314 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:repro state:D stack:0 pid:1023 ppid:360 flags:0x00004004 Call Trace: <TASK> __schedule+0x43b/0xd00 schedule+0x6a/0xf0 io_schedule+0x4a/0x80 folio_wait_bit_common+0x1b5/0x4e0 ? compaction_alloc+0x77/0x1150 ? __pfx_wake_page_function+0x10/0x10 folio_wait_bit+0x30/0x40 folio_wait_writeback+0x2e/0x1e0 migrate_pages_batch+0x555/0x1ac0 ? __pfx_compaction_alloc+0x10/0x10 ? __pfx_compaction_free+0x10/0x10 ? __this_cpu_preempt_check+0x17/0x20 ? lock_is_held_type+0xe6/0x140 migrate_pages+0x100e/0x1180 ? __pfx_compaction_free+0x10/0x10 ? __pfx_compaction_alloc+0x10/0x10 compact_zone+0xe10/0x1b50 ? lock_is_held_type+0xe6/0x140 ? check_preemption_disabled+0x80/0xf0 compact_node+0xa3/0x100 ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30 ? _find_first_bit+0x7b/0x90 sysctl_compaction_handler+0x5d/0xb0 proc_sys_call_handler+0x29d/0x420 proc_sys_write+0x2b/0x40 vfs_write+0x3a3/0x780 ksys_write+0xb7/0x180 __x64_sys_write+0x26/0x30 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f3a2471f59d RSP: 002b:00007ffe567f7288 EFLAGS: 00000217 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3a2471f59d RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000005 RBP: 00007ffe567f72a0 R08: 0000000000000010 R09: 0000000000000010 R10: 0000000000000010 R11: 0000000000000217 R12: 00000000004012e0 R13: 00007ffe567f73e0 R14: 0000000000000000 R15: 0000000000000000 </TASK> The page migration task has held the lock of the shmem folio A, and is waiting the writeback of the folio B of the file system on the loop block device to complete. 
While the loop worker task which writes back the folio B is waiting to lock the shmem folio A, because the folio A backs the folio B in the loop device. Thus deadlock is triggered. In general, if we have locked some other folios except the one we are migrating, it's not safe to wait synchronously, for example, to wait the writeback to complete or wait to lock the buffer head. To fix the deadlock, in this patch, we avoid to batch the page migration except for MIGRATE_ASYNC mode. In MIGRATE_ASYNC mode, synchronous waiting is avoided. The fix can be improved further. We will do that as soon as possible. Link: https://lkml.kernel.org/r/[email protected] Link: https://lore.kernel.org/linux-mm/[email protected]/ Link: https://lore.kernel.org/linux-mm/[email protected]/ Link: https://lore.kernel.org/linux-mm/20230227110614.dngdub2j3exr6dfp@quack3/ Link: https://lkml.kernel.org/r/[email protected] Fixes: 5dfab10 ("migrate_pages: batch _unmap and _move") Signed-off-by: "Huang, Ying" <[email protected]> Reported-by: Hugh Dickins <[email protected]> Reported-by: "Xu, Pengfei" <[email protected]> Cc: Jan Kara <[email protected]> Cc: Baolin Wang <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Stefan Roesch <[email protected]> Cc: Tejun Heo <[email protected]> Cc: Xin Hao <[email protected]> Cc: Zi Yan <[email protected]> Cc: Yang Shi <[email protected]> Cc: Matthew Wilcox <[email protected]> Cc: Mike Kravetz <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 89a0045 commit fb3592c

File tree

1 file changed

+26
-43
lines changed

1 file changed

+26
-43
lines changed

mm/migrate.c

Lines changed: 26 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,7 +1112,7 @@ static void migrate_folio_done(struct folio *src,
11121112
/* Obtain the lock on page, remove all ptes. */
11131113
static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
11141114
unsigned long private, struct folio *src,
1115-
struct folio **dstp, int force, bool avoid_force_lock,
1115+
struct folio **dstp, int force,
11161116
enum migrate_mode mode, enum migrate_reason reason,
11171117
struct list_head *ret)
11181118
{
@@ -1163,17 +1163,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
11631163
if (current->flags & PF_MEMALLOC)
11641164
goto out;
11651165

1166-
/*
1167-
* We have locked some folios and are going to wait to lock
1168-
* this folio. To avoid a potential deadlock, let's bail
1169-
* out and not do that. The locked folios will be moved and
1170-
* unlocked, then we can wait to lock this folio.
1171-
*/
1172-
if (avoid_force_lock) {
1173-
rc = -EDEADLOCK;
1174-
goto out;
1175-
}
1176-
11771166
folio_lock(src);
11781167
}
11791168
locked = true;
@@ -1253,7 +1242,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
12531242
/* Establish migration ptes */
12541243
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
12551244
!folio_test_ksm(src) && !anon_vma, src);
1256-
try_to_migrate(src, TTU_BATCH_FLUSH);
1245+
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
12571246
page_was_mapped = 1;
12581247
}
12591248

@@ -1267,7 +1256,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
12671256
* A folio that has not been unmapped will be restored to
12681257
* right list unless we want to retry.
12691258
*/
1270-
if (rc == -EAGAIN || rc == -EDEADLOCK)
1259+
if (rc == -EAGAIN)
12711260
ret = NULL;
12721261

12731262
migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
@@ -1618,6 +1607,11 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
16181607
/*
16191608
* migrate_pages_batch() first unmaps folios in the from list as many as
16201609
* possible, then move the unmapped folios.
1610+
*
1611+
* We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
1612+
* lock or bit when we have locked more than one folio. Which may cause
1613+
* deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
1614+
* length of the from list must be <= 1.
16211615
*/
16221616
static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
16231617
free_page_t put_new_page, unsigned long private,
@@ -1640,11 +1634,11 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
16401634
LIST_HEAD(dst_folios);
16411635
bool nosplit = (reason == MR_NUMA_MISPLACED);
16421636
bool no_split_folio_counting = false;
1643-
bool avoid_force_lock;
16441637

1638+
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
1639+
!list_empty(from) && !list_is_singular(from));
16451640
retry:
16461641
rc_saved = 0;
1647-
avoid_force_lock = false;
16481642
retry = 1;
16491643
for (pass = 0;
16501644
pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
@@ -1689,15 +1683,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
16891683
}
16901684

16911685
rc = migrate_folio_unmap(get_new_page, put_new_page, private,
1692-
folio, &dst, pass > 2, avoid_force_lock,
1693-
mode, reason, ret_folios);
1686+
folio, &dst, pass > 2, mode,
1687+
reason, ret_folios);
16941688
/*
16951689
* The rules are:
16961690
* Success: folio will be freed
16971691
* Unmap: folio will be put on unmap_folios list,
16981692
* dst folio put on dst_folios list
16991693
* -EAGAIN: stay on the from list
1700-
* -EDEADLOCK: stay on the from list
17011694
* -ENOMEM: stay on the from list
17021695
* Other errno: put on ret_folios list
17031696
*/
@@ -1749,14 +1742,6 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
17491742
goto out;
17501743
else
17511744
goto move;
1752-
case -EDEADLOCK:
1753-
/*
1754-
* The folio cannot be locked for potential deadlock.
1755-
* Go move (and unlock) all locked folios. Then we can
1756-
* try again.
1757-
*/
1758-
rc_saved = rc;
1759-
goto move;
17601745
case -EAGAIN:
17611746
if (is_large) {
17621747
large_retry++;
@@ -1771,11 +1756,6 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
17711756
stats->nr_thp_succeeded += is_thp;
17721757
break;
17731758
case MIGRATEPAGE_UNMAP:
1774-
/*
1775-
* We have locked some folios, don't force lock
1776-
* to avoid deadlock.
1777-
*/
1778-
avoid_force_lock = true;
17791759
list_move_tail(&folio->lru, &unmap_folios);
17801760
list_add_tail(&dst->lru, &dst_folios);
17811761
break;
@@ -1900,17 +1880,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
19001880
*/
19011881
list_splice_init(from, ret_folios);
19021882
list_splice_init(&split_folios, from);
1883+
/*
1884+
* Force async mode to avoid to wait lock or bit when we have
1885+
* locked more than one folios.
1886+
*/
1887+
mode = MIGRATE_ASYNC;
19031888
no_split_folio_counting = true;
19041889
goto retry;
19051890
}
19061891

1907-
/*
1908-
* We have unlocked all locked folios, so we can force lock now, let's
1909-
* try again.
1910-
*/
1911-
if (rc == -EDEADLOCK)
1912-
goto retry;
1913-
19141892
return rc;
19151893
}
19161894

@@ -1945,7 +1923,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
19451923
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
19461924
{
19471925
int rc, rc_gather;
1948-
int nr_pages;
1926+
int nr_pages, batch;
19491927
struct folio *folio, *folio2;
19501928
LIST_HEAD(folios);
19511929
LIST_HEAD(ret_folios);
@@ -1959,6 +1937,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
19591937
mode, reason, &stats, &ret_folios);
19601938
if (rc_gather < 0)
19611939
goto out;
1940+
1941+
if (mode == MIGRATE_ASYNC)
1942+
batch = NR_MAX_BATCHED_MIGRATION;
1943+
else
1944+
batch = 1;
19621945
again:
19631946
nr_pages = 0;
19641947
list_for_each_entry_safe(folio, folio2, from, lru) {
@@ -1969,11 +1952,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
19691952
}
19701953

19711954
nr_pages += folio_nr_pages(folio);
1972-
if (nr_pages > NR_MAX_BATCHED_MIGRATION)
1955+
if (nr_pages >= batch)
19731956
break;
19741957
}
1975-
if (nr_pages > NR_MAX_BATCHED_MIGRATION)
1976-
list_cut_before(&folios, from, &folio->lru);
1958+
if (nr_pages >= batch)
1959+
list_cut_before(&folios, from, &folio2->lru);
19771960
else
19781961
list_splice_init(from, &folios);
19791962
rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,

0 commit comments

Comments
 (0)