@@ -84,6 +84,9 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* The highest zone to isolate pages for reclaim from */
+	enum zone_type reclaim_idx;
+
 	unsigned int may_writepage:1;
 
 	/* Can mapped pages be reclaimed? */
@@ -1392,6 +1395,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long nr_taken = 0;
 	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
 	unsigned long scan, nr_pages;
+	LIST_HEAD(pages_skipped);
 
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
 					!list_empty(src); scan++) {
@@ -1402,6 +1406,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+		if (page_zonenum(page) > sc->reclaim_idx) {
+			list_move(&page->lru, &pages_skipped);
+			continue;
+		}
+
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
 			nr_pages = hpage_nr_pages(page);
@@ -1420,6 +1429,15 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		}
 	}
 
+	/*
+	 * Splice any skipped pages to the start of the LRU list. Note that
+	 * this disrupts the LRU order when reclaiming for lower zones but
+	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+	 * scanning would soon rescan the same pages to skip and put the
+	 * system at risk of premature OOM.
+	 */
+	if (!list_empty(&pages_skipped))
+		list_splice(&pages_skipped, src);
 	*nr_scanned = scan;
 	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
 				    nr_taken, mode, is_file_lru(lru));
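Note: the splice-to-head logic added above is an instance of a simple pattern: during a bounded scan, park ineligible entries on a side list and splice them back to the *head* of the source, so the next SWAP_CLUSTER_MAX-bounded pass does not immediately rescan the same entries (the premature-OOM risk the comment describes). Below is a minimal, self-contained userspace sketch of that pattern, not kernel code; the list helpers and the node/demo_isolate names are illustrative stand-ins for list_head, list_move() and list_splice().

/* Userspace sketch of "skip ineligible entries, splice them back to the head". */
#include <stdio.h>

struct node { struct node *prev, *next; int zone; };

static void list_init(struct node *h) { h->prev = h->next = h; }

static void list_del_entry(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void list_add_head(struct node *n, struct node *h)
{
	n->prev = h;
	n->next = h->next;
	h->next->prev = n;
	h->next = n;
}

/* analogue of list_move(): remove from current list, add at head of another */
static void list_move_head(struct node *n, struct node *h)
{
	list_del_entry(n);
	list_add_head(n, h);
}

/* analogue of list_splice(): insert all of 'from' right after the head of 'to' */
static void list_splice_head(struct node *from, struct node *to)
{
	if (from->next == from)
		return;			/* nothing was skipped */
	struct node *first = from->next, *last = from->prev;
	first->prev = to;
	last->next = to->next;
	to->next->prev = last;
	to->next = first;
	list_init(from);
}

/* Take up to nr entries with zone <= reclaim_idx, scanning from the tail as
 * the LRU does; park ineligible entries and splice them back afterwards. */
static int demo_isolate(struct node *src, int reclaim_idx, int nr)
{
	struct node skipped;
	int taken = 0;

	list_init(&skipped);
	while (taken < nr && src->next != src) {
		struct node *n = src->prev;		/* tail of the "LRU" */

		if (n->zone > reclaim_idx) {
			list_move_head(n, &skipped);	/* ineligible: set aside */
			continue;
		}
		list_del_entry(n);			/* "isolate" the entry */
		taken++;
	}
	list_splice_head(&skipped, src);	/* skipped entries go back at the head */
	return taken;
}

int main(void)
{
	struct node head, pages[5] = {
		{ .zone = 0 }, { .zone = 2 }, { .zone = 1 }, { .zone = 2 }, { .zone = 0 },
	};

	list_init(&head);
	for (int i = 0; i < 5; i++)
		list_add_head(&pages[i], &head);

	printf("isolated %d entries\n", demo_isolate(&head, 1, 3));
	return 0;
}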
@@ -1589,7 +1607,7 @@ static int current_may_throttle(void)
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * shrink_inactive_list() is a helper for shrink_node(). It returns the number
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
@@ -2401,12 +2419,13 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
-			bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
+			enum zone_type classzone_idx)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
 	bool reclaimable = false;
+	struct zone *zone = &pgdat->node_zones[classzone_idx];
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2438,7 +2457,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			shrink_zone_memcg(zone, memcg, sc, &lru_pages);
 			zone_lru_pages += lru_pages;
 
-			if (memcg && is_classzone)
+			if (!global_reclaim(sc))
 				shrink_slab(sc->gfp_mask, zone_to_nid(zone),
 					    memcg, sc->nr_scanned - scanned,
 					    lru_pages);
@@ -2469,7 +2488,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		 * Shrink the slab caches in the same proportion that
 		 * the eligible LRU pages were scanned.
 		 */
-		if (global_reclaim(sc) && is_classzone)
+		if (global_reclaim(sc))
 			shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
 				    sc->nr_scanned - nr_scanned,
 				    zone_lru_pages);
@@ -2553,25 +2572,31 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
-	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+	enum zone_type classzone_idx;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
 	 * allowed level, force direct reclaim to scan the highmem zone as
 	 * highmem pages could be pinning lowmem pages storing buffer_heads
 	 */
 	orig_mask = sc->gfp_mask;
-	if (buffer_heads_over_limit)
+	if (buffer_heads_over_limit) {
 		sc->gfp_mask |= __GFP_HIGHMEM;
+		sc->reclaim_idx = classzone_idx = gfp_zone(sc->gfp_mask);
+	}
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask), sc->nodemask) {
-		enum zone_type classzone_idx;
-
+					sc->reclaim_idx, sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 
-		classzone_idx = requested_highidx;
+		/*
+		 * Note that reclaim_idx does not change as it is the highest
+		 * zone reclaimed from which for empty zones is a no-op but
+		 * classzone_idx is used by shrink_node to test if the slabs
+		 * should be shrunk on a given node.
+		 */
+		classzone_idx = sc->reclaim_idx;
 		while (!populated_zone(zone->zone_pgdat->node_zones +
 					classzone_idx))
 			classzone_idx--;
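Note: a compact illustration of the comment above — reclaim_idx stays fixed at what the caller requested, while classzone_idx is walked down per node to the highest populated zone before being handed to shrink_node(). This is illustrative-only userspace code; populated[] stands in for populated_zone() on pgdat->node_zones, and the > 0 guard is added so the standalone example cannot underflow.

#include <stdio.h>

/* Lower a requested zone index to the highest zone this "node" populates. */
static int demo_classzone_idx(const int *populated, int reclaim_idx)
{
	int classzone_idx = reclaim_idx;

	while (classzone_idx > 0 && !populated[classzone_idx])
		classzone_idx--;
	return classzone_idx;
}

int main(void)
{
	/* a node with zones 0 and 2 populated but nothing above (indices 0..3) */
	int populated[] = { 1, 0, 1, 0 };

	/* reclaim_idx remains 3; the per-node classzone_idx drops to 2 */
	printf("classzone_idx = %d\n", demo_classzone_idx(populated, 3));
	return 0;
}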
@@ -2600,8 +2625,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			 */
 			if (IS_ENABLED(CONFIG_COMPACTION) &&
 			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-			    zonelist_zone_idx(z) <= requested_highidx &&
-			    compaction_ready(zone, sc->order, requested_highidx)) {
+			    zonelist_zone_idx(z) <= classzone_idx &&
+			    compaction_ready(zone, sc->order, classzone_idx)) {
 				sc->compaction_ready = true;
 				continue;
 			}
@@ -2621,7 +2646,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+		shrink_node(zone->zone_pgdat, sc, classzone_idx);
 	}
 
 	/*
@@ -2847,6 +2872,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+		.reclaim_idx = gfp_zone(gfp_mask),
 		.order = order,
 		.nodemask = nodemask,
 		.priority = DEF_PRIORITY,
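Note: direct reclaim derives reclaim_idx from the allocation's gfp mask, whereas the kswapd, memcg and hibernation paths below use MAX_NR_ZONES - 1 (no restriction). The sketch below is a simplified, illustrative model of that mapping only: the real gfp_zone() decodes a packed GFP_ZONE_TABLE and also handles __GFP_MOVABLE, and the DEMO_* flags here are made up for the example.

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

#define DEMO_GFP_DMA     0x1u
#define DEMO_GFP_DMA32   0x2u
#define DEMO_GFP_HIGHMEM 0x4u

/* Pick the highest zone the allocation may use: that is what reclaim_idx needs. */
static enum zone_type demo_gfp_zone(unsigned int gfp_mask)
{
	if (gfp_mask & DEMO_GFP_DMA)
		return ZONE_DMA;
	if (gfp_mask & DEMO_GFP_DMA32)
		return ZONE_DMA32;
	if (gfp_mask & DEMO_GFP_HIGHMEM)
		return ZONE_HIGHMEM;
	return ZONE_NORMAL;	/* no zone modifier: lowmem-only request */
}

int main(void)
{
	/* A GFP_KERNEL-style request reclaims up to ZONE_NORMAL only;
	 * a highmem-capable request may also have highmem pages reclaimed. */
	printf("lowmem request  -> reclaim_idx %d\n", demo_gfp_zone(0));
	printf("highmem request -> reclaim_idx %d\n", demo_gfp_zone(DEMO_GFP_HIGHMEM));
	return 0;
}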
@@ -2886,6 +2912,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.target_mem_cgroup = memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
+		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
 	};
 	unsigned long lru_pages;
@@ -2924,6 +2951,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+		.reclaim_idx = MAX_NR_ZONES - 1,
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
@@ -3118,7 +3146,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 						balance_gap, classzone_idx))
 		return true;
 
-	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+	shrink_node(zone->zone_pgdat, sc, classzone_idx);
 
 	/* TODO: ANOMALY */
 	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
@@ -3167,6 +3195,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
+		.reclaim_idx = MAX_NR_ZONES - 1,
 		.order = order,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
@@ -3237,15 +3266,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 			sc.may_writepage = 1;
 
 		/*
-		 * Now scan the zone in the dma->highmem direction, stopping
-		 * at the last zone which needs scanning.
-		 *
-		 * We do this because the page allocator works in the opposite
-		 * direction. This prevents the page allocator from allocating
-		 * pages behind kswapd's direction of progress, which would
-		 * cause too much scanning of the lower zones.
+		 * Continue scanning in the highmem->dma direction stopping at
+		 * the last zone which needs scanning. This may reclaim lowmem
+		 * pages that are not necessary for zone balancing but it
+		 * preserves LRU ordering. It is assumed that the bulk of
+		 * allocation requests can use arbitrary zones with the
+		 * possible exception of big highmem:lowmem configurations.
 		 */
-		for (i = 0; i <= end_zone; i++) {
+		for (i = end_zone; i >= 0; i--) {
 			struct zone *zone = pgdat->node_zones + i;
 
 			if (!populated_zone(zone))
@@ -3256,6 +3284,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 				continue;
 
 			sc.nr_scanned = 0;
+			sc.reclaim_idx = i;
 
 			nr_soft_scanned = 0;
 			/*
@@ -3513,6 +3542,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	struct scan_control sc = {
 		.nr_to_reclaim = nr_to_reclaim,
 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+		.reclaim_idx = MAX_NR_ZONES - 1,
 		.priority = DEF_PRIORITY,
 		.may_writepage = 1,
 		.may_unmap = 1,
@@ -3704,6 +3734,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
+		.reclaim_idx = zone_idx(zone),
 	};
 
 	cond_resched();
@@ -3723,7 +3754,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * priorities until we have enough memory freed.
 	 */
 	do {
-		shrink_zone(zone, &sc, true);
+		shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
 	} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 }
 