
Commit fec58a4

Add podman system check for checking storage consistency
Add a `podman system check` that performs consistency checks on local storage, optionally removing damaged items so that they can be recreated.

Signed-off-by: Nalin Dahyabhai <[email protected]>
1 parent c510959 commit fec58a4
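
For orientation, the sketch below condenses the option and report shapes this commit relies on, with field sets inferred from the code in the diffs that follow; the canonical definitions live in pkg/domain/entities and are not part of this diff, so treat the snippet as illustrative rather than authoritative.

```go
package entities

import "time"

// Field sets inferred from the diffs below; illustrative, not the canonical
// declarations from pkg/domain/entities.
type SystemCheckOptions struct {
	Quick                       bool           // skip time-consuming checks
	Repair                      bool           // remove damaged images
	RepairLossy                 bool           // also remove containers that depend on damaged images
	UnreferencedLayerMaximumAge *time.Duration // --max: age cutoff for unreferenced layers
}

type SystemCheckReport struct {
	Errors            bool                // true when any damage was found
	Layers            map[string][]string // layer ID -> error descriptions
	ROLayers          map[string][]string // read-only (additional store) layers
	RemovedLayers     []string            // IDs of layers that were deleted
	Images            map[string][]string // image ID -> error descriptions
	ROImages          map[string][]string
	RemovedImages     map[string][]string // image ID -> names it carried
	Containers        map[string][]string // container ID -> error descriptions
	RemovedContainers map[string]string   // container ID -> container name
}
```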

File tree

15 files changed: +565 additions, 0 deletions

cmd/podman/system/check.go

+138
@@ -0,0 +1,138 @@
package system

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/containers/common/pkg/completion"
	"github.com/containers/podman/v5/cmd/podman/registry"
	"github.com/containers/podman/v5/cmd/podman/validate"
	"github.com/containers/podman/v5/pkg/domain/entities/types"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/spf13/cobra"
)

var (
	checkOptions     = types.SystemCheckOptions{}
	checkDescription = `
	podman system check

	Check storage for consistency and remove anything that looks damaged
`

	checkCommand = &cobra.Command{
		Use:               "check [options]",
		Short:             "Check storage consistency",
		Args:              validate.NoArgs,
		Long:              checkDescription,
		RunE:              check,
		ValidArgsFunction: completion.AutocompleteNone,
		Example:           `podman system check`,
	}
)

func init() {
	registry.Commands = append(registry.Commands, registry.CliCommand{
		Command: checkCommand,
		Parent:  systemCmd,
	})
	flags := checkCommand.Flags()
	flags.BoolVarP(&checkOptions.Quick, "quick", "q", false, "Skip time-consuming checks. The default is to include time-consuming checks")
	flags.BoolVarP(&checkOptions.Repair, "repair", "r", false, "Remove inconsistent images")
	flags.BoolVarP(&checkOptions.RepairLossy, "force", "f", false, "Remove inconsistent images and containers")
	flags.DurationP("max", "m", 24*time.Hour, "Maximum allowed age of unreferenced layers")
	_ = checkCommand.RegisterFlagCompletionFunc("max", completion.AutocompleteNone)
}

func check(cmd *cobra.Command, args []string) error {
	flags := cmd.Flags()
	if flags.Changed("max") {
		maxAge, err := flags.GetDuration("max")
		if err != nil {
			return err
		}
		checkOptions.UnreferencedLayerMaximumAge = &maxAge
	}
	response, err := registry.ContainerEngine().SystemCheck(context.Background(), checkOptions)
	if err != nil {
		return err
	}

	if err = printSystemCheckResults(response); err != nil {
		return err
	}

	if !checkOptions.Repair && !checkOptions.RepairLossy && response.Errors {
		return errors.New("damage detected in local storage")
	}

	recheckOptions := checkOptions
	recheckOptions.Repair = false
	recheckOptions.RepairLossy = false
	if response, err = registry.ContainerEngine().SystemCheck(context.Background(), recheckOptions); err != nil {
		return err
	}
	if response.Errors {
		return errors.New("damage in local storage still present after repair attempt")
	}

	return nil
}

func printSystemCheckResults(report *types.SystemCheckReport) error {
	if !report.Errors {
		return nil
	}
	errorSlice := func(strs []string) []error {
		if strs == nil {
			return nil
		}
		errs := make([]error, len(strs))
		for i, s := range strs {
			errs[i] = errors.New(s)
		}
		return errs
	}
	for damagedLayer, errorsSlice := range report.Layers {
		merr := multierror.Append(nil, errorSlice(errorsSlice)...)
		if err := merr.ErrorOrNil(); err != nil {
			fmt.Printf("Damaged layer %s:\n%s", damagedLayer, err)
		}
	}
	for _, removedLayer := range report.RemovedLayers {
		fmt.Printf("Deleted damaged layer: %s\n", removedLayer)
	}
	for damagedROLayer, errorsSlice := range report.ROLayers {
		merr := multierror.Append(nil, errorSlice(errorsSlice)...)
		if err := merr.ErrorOrNil(); err != nil {
			fmt.Printf("Damaged read-only layer %s:\n%s", damagedROLayer, err)
		}
	}
	for damagedImage, errorsSlice := range report.Images {
		merr := multierror.Append(nil, errorSlice(errorsSlice)...)
		if err := merr.ErrorOrNil(); err != nil {
			fmt.Printf("Damaged image %s:\n%s", damagedImage, err)
		}
	}
	for removedImage := range report.RemovedImages {
		fmt.Printf("Deleted damaged image: %s\n", removedImage)
	}
	for damagedROImage, errorsSlice := range report.ROImages {
		merr := multierror.Append(nil, errorSlice(errorsSlice)...)
		if err := merr.ErrorOrNil(); err != nil {
			fmt.Printf("Damaged read-only image %s\n%s", damagedROImage, err)
		}
	}
	for damagedContainer, errorsSlice := range report.Containers {
		merr := multierror.Append(nil, errorSlice(errorsSlice)...)
		if err := merr.ErrorOrNil(); err != nil {
			fmt.Printf("Damaged container %s:\n%s", damagedContainer, err)
		}
	}
	for removedContainer := range report.RemovedContainers {
		fmt.Printf("Deleted damaged container: %s\n", removedContainer)
	}
	return nil
}
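
As a rough illustration of what printSystemCheckResults emits, a hypothetical in-package snippet (not part of the commit) could feed it a synthetic report; the field names come from the code above, the IDs and error strings are made up, and the multierror output is paraphrased.

```go
package system

import "github.com/containers/podman/v5/pkg/domain/entities/types"

// demoPrintResults is a hypothetical helper used only to show the shape of
// the output; it is not part of this commit.
func demoPrintResults() error {
	report := &types.SystemCheckReport{
		Errors: true,
		Layers: map[string][]string{
			"0123abcd": {"size mismatch for layer contents"}, // synthetic example
		},
		RemovedContainers: map[string]string{
			"89efcdab": "old-web", // synthetic example
		},
	}
	// Roughly prints (go-multierror formatting paraphrased):
	//   Damaged layer 0123abcd:
	//   1 error occurred:
	//       * size mismatch for layer contents
	//   Deleted damaged container: 89efcdab
	return printSystemCheckResults(report)
}
```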
docs/source/markdown/podman-system-check.1.md

+59

@@ -0,0 +1,59 @@
% podman-system-check 1

## NAME
podman\-system\-check - Perform consistency checks on image and container storage

## SYNOPSIS
**podman system check** [*options*]

## DESCRIPTION
Perform consistency checks on image and container storage, reporting images and
containers which have identified issues.

## OPTIONS

#### **--force**, **-f**

When attempting to remove damaged images, also remove containers which depend
on those images. By default, damaged images which are being used by containers
are left alone.

Containers which depend on damaged images do so regardless of which engine
created them, but because podman only "knows" how to shut down containers that
it started, the effect on still-running containers which were started by other
engines is difficult to predict.

#### **--max**, **-m**=*duration*

When considering layers which are not used by any images or containers, assume
that any layers which are more than *duration* old are the results of canceled
attempts to pull images, and should be treated as though they are damaged.

#### **--quick**, **-q**

Skip checks which are known to be time-consuming. This will prevent some types
of errors from being detected.

#### **--repair**, **-r**

Remove any images which are determined to have been damaged in some way, unless
they are in use by containers. Use **--force** to remove containers which
depend on damaged images, and those damaged images, as well.

## EXAMPLE

A reasonably quick check:
```
podman system check --quick --repair --force
```

A more thorough check:
```
podman system check --repair --max=1h --force
```

## SEE ALSO
**[podman(1)](podman.1.md)**, **[podman-system(1)](podman-system.1.md)**

## HISTORY
April 2024
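
Connecting the **--max** example above to the implementation in cmd/podman/system/check.go: the flag value is parsed as a Go duration (pflag's DurationP/GetDuration accept the same syntax as time.ParseDuration) and, only when explicitly set, becomes UnreferencedLayerMaximumAge. A minimal standalone sketch of that parsing step:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// "--max=1h" reaches the command as a duration string; pflag parses it
	// with the same grammar as time.ParseDuration.
	maxAge, err := time.ParseDuration("1h")
	if err != nil {
		panic(err)
	}
	fmt.Printf("unreferenced layers older than %s would be treated as damaged\n", maxAge)
}
```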

docs/source/markdown/podman-system.1.md

+1
@@ -13,6 +13,7 @@ The system command allows management of the podman systems

 | Command | Man Page | Description |
 | ------- | ------------------------------------------------------------ | ------------------------------------------------------------------------ |
+| check | [podman-system-check(1)](podman-system-check.1.md) | Perform consistency checks on image and container storage. |
 | connection | [podman-system-connection(1)](podman-system-connection.1.md) | Manage the destination(s) for Podman service(s) |
 | df | [podman-system-df(1)](podman-system-df.1.md) | Show podman disk usage. |
 | events | [podman-events(1)](podman-events.1.md) | Monitor Podman events |

libpod/runtime.go

+133
@@ -31,6 +31,7 @@ import (
 	"github.com/containers/podman/v5/libpod/lock"
 	"github.com/containers/podman/v5/libpod/plugin"
 	"github.com/containers/podman/v5/libpod/shutdown"
+	"github.com/containers/podman/v5/pkg/domain/entities"
 	"github.com/containers/podman/v5/pkg/rootless"
 	"github.com/containers/podman/v5/pkg/systemd"
 	"github.com/containers/podman/v5/pkg/util"

@@ -39,9 +40,11 @@
 	"github.com/containers/storage/pkg/lockfile"
 	"github.com/containers/storage/pkg/unshare"
 	"github.com/docker/docker/pkg/namesgenerator"
+	"github.com/hashicorp/go-multierror"
 	jsoniter "github.com/json-iterator/go"
 	spec "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
+	"golang.org/x/exp/slices"
 )

 // Set up the JSON library for all of Libpod

@@ -1249,3 +1252,133 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {

	return toReturn, locksHeld, nil
}

// SystemCheck checks our storage for consistency, and depending on the options
// specified, will attempt to remove anything which fails consistency checks.
func (r *Runtime) SystemCheck(ctx context.Context, options entities.SystemCheckOptions) (entities.SystemCheckReport, error) {
	what := storage.CheckEverything()
	if options.Quick {
		what = storage.CheckMost()
	}
	if options.UnreferencedLayerMaximumAge != nil {
		tmp := *options.UnreferencedLayerMaximumAge
		what.LayerUnreferencedMaximumAge = &tmp
	}
	storageReport, err := r.store.Check(what)
	if err != nil {
		return entities.SystemCheckReport{}, err
	}
	if len(storageReport.Containers) == 0 &&
		len(storageReport.Layers) == 0 &&
		len(storageReport.ROLayers) == 0 &&
		len(storageReport.Images) == 0 &&
		len(storageReport.ROImages) == 0 {
		// no errors detected
		return entities.SystemCheckReport{}, nil
	}
	mapErrorSlicesToStringSlices := func(m map[string][]error) map[string][]string {
		if len(m) == 0 {
			return nil
		}
		mapped := make(map[string][]string, len(m))
		for k, errs := range m {
			strs := make([]string, len(errs))
			for i, e := range errs {
				strs[i] = e.Error()
			}
			mapped[k] = strs
		}
		return mapped
	}

	report := entities.SystemCheckReport{
		Errors:     true,
		Layers:     mapErrorSlicesToStringSlices(storageReport.Layers),
		ROLayers:   mapErrorSlicesToStringSlices(storageReport.ROLayers),
		Images:     mapErrorSlicesToStringSlices(storageReport.Images),
		ROImages:   mapErrorSlicesToStringSlices(storageReport.ROImages),
		Containers: mapErrorSlicesToStringSlices(storageReport.Containers),
	}
	if !options.Repair && report.Errors {
		// errors detected, no corrective measures to be taken
		return report, err
	}

	// get a list of images that we knew of before we tried to clean up any
	// that were damaged
	imagesBefore, err := r.store.Images()
	if err != nil {
		return report, fmt.Errorf("getting a list of images before attempting repairs: %w", err)
	}

	repairOptions := storage.RepairOptions{
		RemoveContainers: options.RepairLossy,
	}
	var containers []*Container
	if repairOptions.RemoveContainers {
		// build a list of the containers that we claim as ours that we
		// expect to be removing in a bit
		for containerID := range storageReport.Containers {
			ctr, lookupErr := r.state.LookupContainer(containerID)
			if lookupErr != nil {
				// we're about to remove it, so it's okay that
				// it isn't even one of ours
				continue
			}
			containers = append(containers, ctr)
		}
	}

	// run the cleanup
	merr := multierror.Append(nil, r.store.Repair(storageReport, &repairOptions)...)

	if repairOptions.RemoveContainers {
		// get the list of containers that storage will still admit to knowing about
		containersAfter, err := r.store.Containers()
		if err != nil {
			merr = multierror.Append(merr, fmt.Errorf("getting a list of containers after attempting repairs: %w", err))
		}
		for _, ctr := range containers {
			// if one of our containers that we tried to remove is
			// still on disk, report an error
			if slices.IndexFunc(containersAfter, func(containerAfter storage.Container) bool {
				return containerAfter.ID == ctr.ID()
			}) != -1 {
				merr = multierror.Append(merr, fmt.Errorf("clearing storage for container %s: %w", ctr.ID(), err))
				continue
			}
			// remove the container from our database
			if removeErr := r.state.RemoveContainer(ctr); removeErr != nil {
				merr = multierror.Append(merr, fmt.Errorf("updating state database to reflect removal of container %s: %w", ctr.ID(), removeErr))
				continue
			}
			if report.RemovedContainers == nil {
				report.RemovedContainers = make(map[string]string)
			}
			report.RemovedContainers[ctr.ID()] = ctr.config.Name
		}
	}

	// get a list of images that are still around after we clean up any
	// that were damaged
	imagesAfter, err := r.store.Images()
	if err != nil {
		merr = multierror.Append(merr, fmt.Errorf("getting a list of images after attempting repairs: %w", err))
	}
	for _, imageBefore := range imagesBefore {
		if slices.IndexFunc(imagesAfter, func(imageAfter storage.Image) bool {
			return imageAfter.ID == imageBefore.ID
		}) == -1 {
			if report.RemovedImages == nil {
				report.RemovedImages = make(map[string][]string)
			}
			report.RemovedImages[imageBefore.ID] = slices.Clone(imageBefore.Names)
		}
	}

	if merr != nil {
		err = merr.ErrorOrNil()
	}

	return report, err
}
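
The removed-image accounting in SystemCheck compares the image list taken before the repair with the one taken after it. Below is a standalone sketch of that before/after pattern, using the standard library slices package (the commit itself imports golang.org/x/exp/slices, which provides the same IndexFunc and Clone helpers); the IDs and names are hypothetical.

```go
package main

import (
	"fmt"
	"slices" // standard library as of Go 1.21
)

type image struct {
	ID    string
	Names []string
}

// removedItems reports entries present in before but missing from after,
// mirroring how SystemCheck populates report.RemovedImages.
func removedItems(before, after []image) map[string][]string {
	removed := make(map[string][]string)
	for _, b := range before {
		stillPresent := slices.IndexFunc(after, func(a image) bool {
			return a.ID == b.ID
		}) != -1
		if !stillPresent {
			removed[b.ID] = slices.Clone(b.Names)
		}
	}
	return removed
}

func main() {
	before := []image{
		{ID: "aaa", Names: []string{"example.test/app:1"}}, // hypothetical IDs and names
		{ID: "bbb"},
	}
	after := []image{{ID: "bbb"}}
	fmt.Println(removedItems(before, after)) // map[aaa:[example.test/app:1]]
}
```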
