Skip to content

Commit 96d365e

Browse files
committed
cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem
Currently there are two ways to walk tasks of a cgroup - css_task_iter_start/next/end() and css_scan_tasks(). The latter builds on the former but allows blocking while iterating. Unfortunately, the way css_scan_tasks() is implemented is rather nasty: it uses a priority heap of pointers to extract some number of tasks in task creation order and loops over them invoking the callback and repeats that until it reaches the end. It requires either a preallocated heap or may fail under memory pressure, while unlikely to be problematic, the complexity is O(N^2), and in general just nasty. We're gonna convert all css_scan_tasks() users to css_task_iter_start/next/end() and remove css_scan_tasks(). As css_scan_tasks() users may block, let's convert css_set_lock to a rwsem so that tasks can block while css_task_iter_*() is in progress. While this does increase the chance of possible deadlock scenarios, given the current usage, the probability is relatively low, and even if that happens, the right thing to do is updating the iteration in the similar way to css iterators so that it can handle blocking. Most conversions are trivial; however, task_cgroup_path() now expects to be called with css_set_rwsem locked instead of locking itself. This is because the function is called with RCU read lock held and rwsem locking should nest outside RCU read lock. Signed-off-by: Tejun Heo <[email protected]> Acked-by: Li Zefan <[email protected]>
1 parent e406d1c commit 96d365e

File tree

1 file changed

+57
-47
lines changed

1 file changed

+57
-47
lines changed

kernel/cgroup.c

Lines changed: 57 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <linux/sched.h>
4343
#include <linux/slab.h>
4444
#include <linux/spinlock.h>
45+
#include <linux/rwsem.h>
4546
#include <linux/string.h>
4647
#include <linux/sort.h>
4748
#include <linux/kmod.h>
@@ -341,11 +342,10 @@ static struct css_set init_css_set;
341342
static struct cgrp_cset_link init_cgrp_cset_link;
342343

343344
/*
344-
* css_set_lock protects the list of css_set objects, and the chain of
345-
* tasks off each css_set. Nests outside task->alloc_lock due to
346-
* css_task_iter_start().
345+
* css_set_rwsem protects the list of css_set objects, and the chain of
346+
* tasks off each css_set.
347347
*/
348-
static DEFINE_RWLOCK(css_set_lock);
348+
static DECLARE_RWSEM(css_set_rwsem);
349349
static int css_set_count;
350350

351351
/*
@@ -380,9 +380,9 @@ static void __put_css_set(struct css_set *cset, int taskexit)
380380
*/
381381
if (atomic_add_unless(&cset->refcount, -1, 1))
382382
return;
383-
write_lock(&css_set_lock);
383+
down_write(&css_set_rwsem);
384384
if (!atomic_dec_and_test(&cset->refcount)) {
385-
write_unlock(&css_set_lock);
385+
up_write(&css_set_rwsem);
386386
return;
387387
}
388388

@@ -396,7 +396,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
396396
list_del(&link->cset_link);
397397
list_del(&link->cgrp_link);
398398

399-
/* @cgrp can't go away while we're holding css_set_lock */
399+
/* @cgrp can't go away while we're holding css_set_rwsem */
400400
if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
401401
if (taskexit)
402402
set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -406,7 +406,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
406406
kfree(link);
407407
}
408408

409-
write_unlock(&css_set_lock);
409+
up_write(&css_set_rwsem);
410410
kfree_rcu(cset, rcu_head);
411411
}
412412

@@ -627,11 +627,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
627627

628628
/* First see if we already have a cgroup group that matches
629629
* the desired set */
630-
read_lock(&css_set_lock);
630+
down_read(&css_set_rwsem);
631631
cset = find_existing_css_set(old_cset, cgrp, template);
632632
if (cset)
633633
get_css_set(cset);
634-
read_unlock(&css_set_lock);
634+
up_read(&css_set_rwsem);
635635

636636
if (cset)
637637
return cset;
@@ -655,7 +655,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
655655
* find_existing_css_set() */
656656
memcpy(cset->subsys, template, sizeof(cset->subsys));
657657

658-
write_lock(&css_set_lock);
658+
down_write(&css_set_rwsem);
659659
/* Add reference counts and links from the new css_set. */
660660
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
661661
struct cgroup *c = link->cgrp;
@@ -673,7 +673,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
673673
key = css_set_hash(cset->subsys);
674674
hash_add(css_set_table, &cset->hlist, key);
675675

676-
write_unlock(&css_set_lock);
676+
up_write(&css_set_rwsem);
677677

678678
return cset;
679679
}
@@ -739,14 +739,14 @@ static void cgroup_destroy_root(struct cgroupfs_root *root)
739739
* Release all the links from cset_links to this hierarchy's
740740
* root cgroup
741741
*/
742-
write_lock(&css_set_lock);
742+
down_write(&css_set_rwsem);
743743

744744
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
745745
list_del(&link->cset_link);
746746
list_del(&link->cgrp_link);
747747
kfree(link);
748748
}
749-
write_unlock(&css_set_lock);
749+
up_write(&css_set_rwsem);
750750

751751
if (!list_empty(&root->root_list)) {
752752
list_del(&root->root_list);
@@ -764,16 +764,17 @@ static void cgroup_destroy_root(struct cgroupfs_root *root)
764764

765765
/*
766766
* Return the cgroup for "task" from the given hierarchy. Must be
767-
* called with cgroup_mutex held.
767+
* called with cgroup_mutex and css_set_rwsem held.
768768
*/
769769
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770770
struct cgroupfs_root *root)
771771
{
772772
struct css_set *cset;
773773
struct cgroup *res = NULL;
774774

775-
BUG_ON(!mutex_is_locked(&cgroup_mutex));
776-
read_lock(&css_set_lock);
775+
lockdep_assert_held(&cgroup_mutex);
776+
lockdep_assert_held(&css_set_rwsem);
777+
777778
/*
778779
* No need to lock the task - since we hold cgroup_mutex the
779780
* task can't change groups, so the only thing that can happen
@@ -794,7 +795,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
794795
}
795796
}
796797
}
797-
read_unlock(&css_set_lock);
798+
798799
BUG_ON(!res);
799800
return res;
800801
}
@@ -1310,7 +1311,7 @@ static void cgroup_enable_task_cg_lists(void)
13101311
{
13111312
struct task_struct *p, *g;
13121313

1313-
write_lock(&css_set_lock);
1314+
down_write(&css_set_rwsem);
13141315

13151316
if (use_task_css_set_links)
13161317
goto out_unlock;
@@ -1343,7 +1344,7 @@ static void cgroup_enable_task_cg_lists(void)
13431344
} while_each_thread(g, p);
13441345
read_unlock(&tasklist_lock);
13451346
out_unlock:
1346-
write_unlock(&css_set_lock);
1347+
up_write(&css_set_rwsem);
13471348
}
13481349

13491350
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1408,7 +1409,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
14081409
root_cgrp->id = ret;
14091410

14101411
/*
1411-
* We're accessing css_set_count without locking css_set_lock here,
1412+
* We're accessing css_set_count without locking css_set_rwsem here,
14121413
* but that's OK - it can only be increased by someone holding
14131414
* cgroup_lock, and that's us. The worst that can happen is that we
14141415
* have some link structures left over
@@ -1451,10 +1452,10 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
14511452
* Link the top cgroup in this hierarchy into all the css_set
14521453
* objects.
14531454
*/
1454-
write_lock(&css_set_lock);
1455+
down_write(&css_set_rwsem);
14551456
hash_for_each(css_set_table, i, cset, hlist)
14561457
link_css_set(&tmp_links, cset, root_cgrp);
1457-
write_unlock(&css_set_lock);
1458+
up_write(&css_set_rwsem);
14581459

14591460
BUG_ON(!list_empty(&root_cgrp->children));
14601461
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -1617,6 +1618,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
16171618
char *path = NULL;
16181619

16191620
mutex_lock(&cgroup_mutex);
1621+
down_read(&css_set_rwsem);
16201622

16211623
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
16221624

@@ -1629,6 +1631,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
16291631
path = buf;
16301632
}
16311633

1634+
up_read(&css_set_rwsem);
16321635
mutex_unlock(&cgroup_mutex);
16331636
return path;
16341637
}
@@ -1739,9 +1742,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
17391742
rcu_assign_pointer(tsk->cgroups, new_cset);
17401743
task_unlock(tsk);
17411744

1742-
write_lock(&css_set_lock);
1745+
down_write(&css_set_rwsem);
17431746
list_move(&tsk->cg_list, &new_cset->tasks);
1744-
write_unlock(&css_set_lock);
1747+
up_write(&css_set_rwsem);
17451748

17461749
/*
17471750
* We just gained a reference on old_cset by taking it from the
@@ -1799,6 +1802,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
17991802
* already PF_EXITING could be freed from underneath us unless we
18001803
* take an rcu_read_lock.
18011804
*/
1805+
down_read(&css_set_rwsem);
18021806
rcu_read_lock();
18031807
do {
18041808
struct task_and_cgroup ent;
@@ -1826,6 +1830,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
18261830
break;
18271831
} while_each_thread(leader, tsk);
18281832
rcu_read_unlock();
1833+
up_read(&css_set_rwsem);
18291834
/* remember the number of threads in the array for later. */
18301835
group_size = i;
18311836
tset.tc_array = group;
@@ -2003,7 +2008,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
20032008

20042009
mutex_lock(&cgroup_mutex);
20052010
for_each_active_root(root) {
2006-
struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2011+
struct cgroup *from_cgrp;
2012+
2013+
down_read(&css_set_rwsem);
2014+
from_cgrp = task_cgroup_from_root(from, root);
2015+
up_read(&css_set_rwsem);
20072016

20082017
retval = cgroup_attach_task(from_cgrp, tsk, false);
20092018
if (retval)
@@ -2396,10 +2405,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
23962405
int count = 0;
23972406
struct cgrp_cset_link *link;
23982407

2399-
read_lock(&css_set_lock);
2408+
down_read(&css_set_rwsem);
24002409
list_for_each_entry(link, &cgrp->cset_links, cset_link)
24012410
count += atomic_read(&link->cset->refcount);
2402-
read_unlock(&css_set_lock);
2411+
up_read(&css_set_rwsem);
24032412
return count;
24042413
}
24052414

@@ -2630,12 +2639,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
26302639
*/
26312640
void css_task_iter_start(struct cgroup_subsys_state *css,
26322641
struct css_task_iter *it)
2633-
__acquires(css_set_lock)
2642+
__acquires(css_set_rwsem)
26342643
{
26352644
/* no one should try to iterate before mounting cgroups */
26362645
WARN_ON_ONCE(!use_task_css_set_links);
26372646

2638-
read_lock(&css_set_lock);
2647+
down_read(&css_set_rwsem);
26392648

26402649
it->origin_css = css;
26412650
it->cset_link = &css->cgroup->cset_links;
@@ -2683,9 +2692,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
26832692
* Finish task iteration started by css_task_iter_start().
26842693
*/
26852694
void css_task_iter_end(struct css_task_iter *it)
2686-
__releases(css_set_lock)
2695+
__releases(css_set_rwsem)
26872696
{
2688-
read_unlock(&css_set_lock);
2697+
up_read(&css_set_rwsem);
26892698
}
26902699

26912700
static inline int started_after_time(struct task_struct *t1,
@@ -2735,7 +2744,7 @@ static inline int started_after(void *p1, void *p2)
27352744
*
27362745
* @test may be NULL, meaning always true (select all tasks), which
27372746
* effectively duplicates css_task_iter_{start,next,end}() but does not
2738-
* lock css_set_lock for the call to @process.
2747+
* lock css_set_rwsem for the call to @process.
27392748
*
27402749
* It is guaranteed that @process will act on every task that is a member
27412750
* of @css for the duration of this call. This function may or may not
@@ -3867,12 +3876,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
38673876
lockdep_assert_held(&cgroup_mutex);
38683877

38693878
/*
3870-
* css_set_lock synchronizes access to ->cset_links and prevents
3879+
* css_set_rwsem synchronizes access to ->cset_links and prevents
38713880
* @cgrp from being removed while __put_css_set() is in progress.
38723881
*/
3873-
read_lock(&css_set_lock);
3882+
down_read(&css_set_rwsem);
38743883
empty = list_empty(&cgrp->cset_links);
3875-
read_unlock(&css_set_lock);
3884+
up_read(&css_set_rwsem);
38763885
if (!empty)
38773886
return -EBUSY;
38783887

@@ -4208,6 +4217,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
42084217
retval = 0;
42094218

42104219
mutex_lock(&cgroup_mutex);
4220+
down_read(&css_set_rwsem);
42114221

42124222
for_each_active_root(root) {
42134223
struct cgroup_subsys *ss;
@@ -4233,6 +4243,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
42334243
}
42344244

42354245
out_unlock:
4246+
up_read(&css_set_rwsem);
42364247
mutex_unlock(&cgroup_mutex);
42374248
put_task_struct(tsk);
42384249
out_free:
@@ -4328,12 +4339,12 @@ void cgroup_post_fork(struct task_struct *child)
43284339
* lock on fork.
43294340
*/
43304341
if (use_task_css_set_links) {
4331-
write_lock(&css_set_lock);
4342+
down_write(&css_set_rwsem);
43324343
task_lock(child);
43334344
if (list_empty(&child->cg_list))
43344345
list_add(&child->cg_list, &task_css_set(child)->tasks);
43354346
task_unlock(child);
4336-
write_unlock(&css_set_lock);
4347+
up_write(&css_set_rwsem);
43374348
}
43384349

43394350
/*
@@ -4390,15 +4401,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
43904401
int i;
43914402

43924403
/*
4393-
* Unlink from the css_set task list if necessary.
4394-
* Optimistically check cg_list before taking
4395-
* css_set_lock
4404+
* Unlink from the css_set task list if necessary. Optimistically
4405+
* check cg_list before taking css_set_rwsem.
43964406
*/
43974407
if (!list_empty(&tsk->cg_list)) {
4398-
write_lock(&css_set_lock);
4408+
down_write(&css_set_rwsem);
43994409
if (!list_empty(&tsk->cg_list))
44004410
list_del_init(&tsk->cg_list);
4401-
write_unlock(&css_set_lock);
4411+
up_write(&css_set_rwsem);
44024412
}
44034413

44044414
/* Reassign the task to the init_css_set. */
@@ -4650,7 +4660,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
46504660
if (!name_buf)
46514661
return -ENOMEM;
46524662

4653-
read_lock(&css_set_lock);
4663+
down_read(&css_set_rwsem);
46544664
rcu_read_lock();
46554665
cset = rcu_dereference(current->cgroups);
46564666
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -4666,7 +4676,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
46664676
c->root->hierarchy_id, name);
46674677
}
46684678
rcu_read_unlock();
4669-
read_unlock(&css_set_lock);
4679+
up_read(&css_set_rwsem);
46704680
kfree(name_buf);
46714681
return 0;
46724682
}
@@ -4677,7 +4687,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
46774687
struct cgroup_subsys_state *css = seq_css(seq);
46784688
struct cgrp_cset_link *link;
46794689

4680-
read_lock(&css_set_lock);
4690+
down_read(&css_set_rwsem);
46814691
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
46824692
struct css_set *cset = link->cset;
46834693
struct task_struct *task;
@@ -4693,7 +4703,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
46934703
}
46944704
}
46954705
}
4696-
read_unlock(&css_set_lock);
4706+
up_read(&css_set_rwsem);
46974707
return 0;
46984708
}
46994709

0 commit comments

Comments
 (0)