Skip to content

Commit 88bc201

Browse files
Add ecosystem specific inclusions or exclusions
Also ignore specific files paths containing metadata in ruby gems. Reference: #1438 Reference: #1476 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent fa9ac3f commit 88bc201

File tree

7 files changed

+225
-35
lines changed

7 files changed

+225
-35
lines changed

scanpipe/config.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
24+
class EcosystemConfig:
25+
"""
26+
Base class for ecosystem specific configurations to be defined
27+
for each ecosystem.
28+
"""
29+
30+
# This should be defined for each ecosystem which
31+
# are options in the pipelines
32+
ecosystem_option = None
33+
34+
# These are extensions for packages of this ecosystem which
35+
# need to be matched from purldb
36+
purldb_package_extensions = []
37+
38+
# These are extensions for resources of this ecosystem which
39+
# need to be matched from purldb
40+
purldb_resource_extensions = []
41+
42+
# Extensions for document files which do not require review
43+
doc_extensions = []
44+
45+
# Paths in the deployed binaries/archives (on the to/ side) which
46+
# do not need review even if they are not matched to the source side
47+
deployed_resource_path_exclusions = []
48+
49+
# Paths in the development/source archive (on the from/ side) which
50+
# should not be considered even if unmapped to the deployed side when
51+
# assessing what to review on the deployed side
52+
devel_resource_path_exclusions = []
53+
54+
# Symbols which are found in ecosystem specific standard libraries
55+
# which are not so useful in mapping
56+
standard_symbols_to_exclude = []
57+
58+
59+
class DefaultEcosystemConfig(EcosystemConfig):
60+
"""Configurations which are common across multiple ecosystems."""
61+
62+
ecosystem_option = "Default"
63+
purldb_package_extensions = [".zip", ".tar.gz", ".tar.xz"]
64+
devel_resource_path_exclusions = ["*/tests/*"]
65+
doc_extensions = [
66+
".pdf",
67+
".doc",
68+
".docx",
69+
".ppt",
70+
".pptx",
71+
".tex",
72+
".odt",
73+
".odp",
74+
]
75+
76+
77+
class JavaEcosystemConfig(EcosystemConfig):
78+
ecosystem_option = "Java"
79+
purldb_package_extensions = [".jar", ".war"]
80+
purldb_resource_extensions = [".class"]
81+
82+
83+
class JavaScriptEcosystemConfig(EcosystemConfig):
84+
ecosystem_option = "JavaScript"
85+
purldb_resource_extensions = [
86+
".map",
87+
".js",
88+
".mjs",
89+
".ts",
90+
".d.ts",
91+
".jsx",
92+
".tsx",
93+
".css",
94+
".scss",
95+
".less",
96+
".sass",
97+
".soy",
98+
]
99+
100+
101+
class GoEcosystemConfig(EcosystemConfig):
102+
ecosystem_option = "Go"
103+
purldb_resource_extensions = [".go"]
104+
105+
106+
class RustEcosystemConfig(EcosystemConfig):
107+
ecosystem_option = "Rust"
108+
purldb_resource_extensions = [".rs"]
109+
110+
111+
class RubyEcosystemConfig(EcosystemConfig):
112+
ecosystem_option = "Ruby"
113+
purldb_package_extensions = [".gem"]
114+
purldb_resource_extensions = [".rb"]
115+
deployed_resource_path_exclusions = ["*checksums.yaml.gz*", "*metadata.gz*"]

scanpipe/pipelines/__init__.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,10 @@ def flag_ignored_resources(self):
7878
ignored_patterns = ignored_patterns.splitlines()
7979
ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
8080

81-
flag.flag_ignored_patterns(self.project, patterns=ignored_patterns)
81+
flag.flag_ignored_patterns(
82+
codebaseresources=self.project.codebaseresources.no_status(),
83+
patterns=ignored_patterns,
84+
)
8285

8386
def extract_archive(self, location, target):
8487
"""Extract archive at `location` to `target`. Save errors as messages."""

scanpipe/pipelines/deploy_to_develop.py

+19-28
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from aboutcode.pipeline import optional_step
2424
from scanpipe import pipes
25+
from scanpipe.config import DefaultEcosystemConfig
2526
from scanpipe.pipelines import Pipeline
2627
from scanpipe.pipes import d2d
2728
from scanpipe.pipes import flag
@@ -31,7 +32,7 @@
3132
from scanpipe.pipes import scancode
3233

3334

34-
class DeployToDevelop(Pipeline):
35+
class DeployToDevelop(Pipeline, DefaultEcosystemConfig):
3536
"""
3637
Establish relationships between two code trees: deployment and development.
3738
@@ -64,6 +65,8 @@ def steps(cls):
6465
cls.flag_empty_files,
6566
cls.flag_whitespace_files,
6667
cls.flag_ignored_resources,
68+
cls.load_ecosystem_config,
69+
cls.load_ecosystem_config_ruby,
6770
cls.map_about_files,
6871
cls.map_checksum,
6972
cls.match_archives_to_purldb,
@@ -91,33 +94,6 @@ def steps(cls):
9194
cls.create_local_files_packages,
9295
)
9396

94-
purldb_package_extensions = [".jar", ".war", ".zip"]
95-
purldb_resource_extensions = [
96-
".map",
97-
".js",
98-
".mjs",
99-
".ts",
100-
".d.ts",
101-
".jsx",
102-
".tsx",
103-
".css",
104-
".scss",
105-
".less",
106-
".sass",
107-
".soy",
108-
".class",
109-
]
110-
doc_extensions = [
111-
".pdf",
112-
".doc",
113-
".docx",
114-
".ppt",
115-
".pptx",
116-
".tex",
117-
".odt",
118-
".odp",
119-
]
120-
12197
def get_inputs(self):
12298
"""Locate the ``from`` and ``to`` input files."""
12399
self.from_files, self.to_files = d2d.get_inputs(self.project)
@@ -152,6 +128,15 @@ def flag_whitespace_files(self):
152128
"""Flag whitespace files with size less than or equal to 100 byte as ignored."""
153129
d2d.flag_whitespace_files(project=self.project)
154130

131+
def load_ecosystem_config(self):
    """
    Load ecosystem specific configurations for d2d steps for selected options.

    Delegates to ``d2d.load_ecosystem_config``, passing this pipeline instance
    and its ``selected_groups`` as the options to load.
    """
    d2d.load_ecosystem_config(pipeline=self, options=self.selected_groups)
134+
135+
@optional_step("Ruby")
def load_ecosystem_config_ruby(self):
    """
    Load Ruby specific configurations for d2d steps.

    This step intentionally does nothing by itself.
    NOTE(review): presumably it only exists so that "Ruby" can be selected as
    an optional group, which ``load_ecosystem_config`` then receives through
    ``selected_groups`` -- confirm.
    """
    pass
139+
155140
def map_about_files(self):
156141
"""Map ``from/`` .ABOUT files to their related ``to/`` resources."""
157142
d2d.map_about_files(project=self.project, logger=self.log)
@@ -268,6 +253,7 @@ def flag_mapped_resources_archives_and_ignored_directories(self):
268253
def perform_house_keeping_tasks(self):
269254
"""
270255
On deployed side
256+
- Ignore specific files based on ecosystem based configurations.
271257
- PurlDB match files with ``no-java-source`` and empty status,
272258
if no match is found update status to ``requires-review``.
273259
- Update status for uninteresting files.
@@ -278,6 +264,11 @@ def perform_house_keeping_tasks(self):
278264
"""
279265
d2d.match_resources_with_no_java_source(project=self.project, logger=self.log)
280266
d2d.handle_dangling_deployed_legal_files(project=self.project, logger=self.log)
267+
d2d.ignore_unmapped_resources_from_config(
268+
project=self.project,
269+
patterns_to_ignore=self.deployed_resource_path_exclusions,
270+
logger=self.log,
271+
)
281272
d2d.match_unmapped_resources(
282273
project=self.project,
283274
matched_extensions=self.purldb_resource_extensions,

scanpipe/pipes/d2d.py

+74
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from summarycode.classify import LEGAL_STARTS_ENDS
4848

4949
from aboutcode.pipeline import LoopProgress
50+
from scanpipe import config
5051
from scanpipe import pipes
5152
from scanpipe.models import CodebaseRelation
5253
from scanpipe.models import CodebaseResource
@@ -66,6 +67,16 @@
6667
TO = "to/"
6768

6869

70+
ECOSYSTEM_CONFIGS = [
71+
config.DefaultEcosystemConfig,
72+
config.JavaEcosystemConfig,
73+
config.JavaScriptEcosystemConfig,
74+
config.RubyEcosystemConfig,
75+
config.RustEcosystemConfig,
76+
config.GoEcosystemConfig,
77+
]
78+
79+
6980
def get_inputs(project):
7081
"""
7182
Locate the ``from`` and ``to`` input files in project inputs/ directory.
@@ -114,6 +125,55 @@ def get_best_path_matches(to_resource, matches):
114125
return matches
115126

116127

128+
def load_ecosystem_config(pipeline, options):
    """
    Add ecosystem specific configurations for each ecosystem selected
    as `options` to the `pipeline`.

    The "Default" configuration is always applied first. Each entry of
    `options` that matches the ``ecosystem_option`` of a class listed in
    ``ECOSYSTEM_CONFIGS`` is then applied in order; entries without a matching
    configuration are skipped. `options` may be ``None`` or empty, in which
    case only the default configuration is applied.
    """
    configs_by_ecosystem = {
        ecosystem.ecosystem_option: ecosystem for ecosystem in ECOSYSTEM_CONFIGS
    }

    # Add default configurations which are common across ecosystems
    add_ecosystem_config(
        pipeline=pipeline,
        configs_by_ecosystem=configs_by_ecosystem,
        selected_option="Default",
    )

    # Add configurations for each selected ecosystem.
    # ``options`` can be None when no option was selected on the pipeline.
    for selected_option in options or []:
        if selected_option not in configs_by_ecosystem:
            continue

        add_ecosystem_config(
            pipeline=pipeline,
            configs_by_ecosystem=configs_by_ecosystem,
            selected_option=selected_option,
        )
154+
155+
156+
def add_ecosystem_config(pipeline, configs_by_ecosystem, selected_option):
    """
    Merge the configuration of the `selected_option` ecosystem into `pipeline`.

    `configs_by_ecosystem` maps an ecosystem option name to its configuration
    class. For each supported configuration attribute, the non-empty values of
    the selected ecosystem are appended to the values already present on the
    `pipeline`, preserving order and dropping duplicates. The merged list is
    set on the `pipeline` object itself; the lists stored on the configuration
    classes (and on the pipeline class) are never modified.

    Nothing is done when `selected_option` has no entry in
    `configs_by_ecosystem`.
    """
    d2d_pipeline_configs = [
        "purldb_package_extensions",
        "purldb_resource_extensions",
        "deployed_resource_path_exclusions",
    ]

    ecosystem_config = configs_by_ecosystem.get(selected_option)
    if ecosystem_config is None:
        # Unknown option: there is no configuration to merge.
        return

    for pipeline_config in d2d_pipeline_configs:
        config_value = getattr(ecosystem_config, pipeline_config, None)
        if not config_value:
            continue

        pipeline_config_value = getattr(pipeline, pipeline_config, None) or []

        # Build a brand new list: ``list.extend()`` returns None and mutates
        # in place, which would both lose the merged value and alter the
        # class-level lists shared by every pipeline and ecosystem config.
        merged_values = [*pipeline_config_value, *config_value]
        new_config_value = list(dict.fromkeys(merged_values))

        setattr(pipeline, pipeline_config, new_config_value)
175+
176+
117177
def get_from_files_for_scanning(resources):
118178
"""
119179
Return resources in the "from/" side which has been mapped to the "to/"
@@ -1453,6 +1513,20 @@ def match_resources_with_no_java_source(project, logger=None):
14531513
)
14541514

14551515

1516+
def ignore_unmapped_resources_from_config(project, patterns_to_ignore, logger=None):
    """
    Flag as ignored the `project` resources matching `patterns_to_ignore`.

    Only the resources returned by ``to_codebase().no_status()`` are
    considered, and matches get the ``flag.IGNORED_FROM_CONFIG`` status.
    When a `logger` callable is provided, it is called with a message
    reporting the number of flagged resources.
    """
    candidate_resources = project.codebaseresources.to_codebase().no_status()
    flagged_count = flag.flag_ignored_patterns(
        codebaseresources=candidate_resources,
        patterns=patterns_to_ignore,
        status=flag.IGNORED_FROM_CONFIG,
    )

    if not logger:
        return

    message = (
        f"Ignoring {flagged_count:,d} to/ resources with "
        "from ecosystem specific configurations."
    )
    logger(message)
1528+
1529+
14561530
def match_unmapped_resources(project, matched_extensions=None, logger=None):
14571531
"""
14581532
Match resources with empty status to PurlDB, if unmatched

scanpipe/pipes/flag.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
IGNORED_DEFAULT_IGNORES = "ignored-default-ignores"
4444
IGNORED_DATA_FILE_NO_CLUES = "ignored-data-file-no-clues"
4545
IGNORED_DOC_FILE = "ignored-doc-file"
46+
IGNORED_FROM_CONFIG = "ignored-from-config"
4647

4748
COMPLIANCE_LICENSES = "compliance-licenses"
4849
COMPLIANCE_SOURCEMIRROR = "compliance-sourcemirror"
@@ -89,15 +90,15 @@ def flag_ignored_directories(project):
8990
return qs.update(status=IGNORED_DIRECTORY)
9091

9192

92-
def flag_ignored_patterns(project, patterns):
93+
def flag_ignored_patterns(codebaseresources, patterns, status=IGNORED_PATTERN):
9394
"""Flag codebase resource as ``ignored`` status from list of ``patterns``."""
9495
if isinstance(patterns, str):
9596
patterns = patterns.splitlines()
9697

9798
update_count = 0
9899
for pattern in patterns:
99-
qs = project.codebaseresources.no_status().path_pattern(pattern)
100-
update_count += qs.update(status=IGNORED_PATTERN)
100+
qs = codebaseresources.path_pattern(pattern)
101+
update_count += qs.update(status=status)
101102

102103
return update_count
103104

scanpipe/tests/pipes/test_flag.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ def test_scanpipe_pipes_flag_flag_ignored_directories(self):
7070

7171
def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
7272
patterns = ["*.ext", "dir/*"]
73-
updated = flag.flag_ignored_patterns(self.project1, patterns)
73+
updated = flag.flag_ignored_patterns(
74+
self.project1.codebaseresources.no_status(), patterns
75+
)
7476

7577
self.assertEqual(3, updated)
7678
self.resource1.refresh_from_db()
@@ -85,7 +87,8 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self):
8587
make_resource_file(self.project1, "path/deeper/policies.yml")
8688
make_resource_file(self.project1, "path/other-policies.yml")
8789
updated = flag.flag_ignored_patterns(
88-
self.project1, flag.DEFAULT_IGNORED_PATTERNS
90+
self.project1.codebaseresources.no_status(),
91+
flag.DEFAULT_IGNORED_PATTERNS,
8992
)
9093
self.assertEqual(3, updated)
9194

scanpipe/tests/test_pipelines.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,10 @@ def test_scanpipe_pipeline_class_flag_ignored_resources(self):
423423
mock_flag.return_value = None
424424
pipeline.flag_ignored_resources()
425425
patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS]
426-
mock_flag.assert_called_with(project1, patterns=patterns_args)
426+
mock_flag.assert_called_with(
427+
codebaseresources=project1.codebaseresources.no_status(),
428+
patterns=patterns_args,
429+
)
427430

428431
def test_scanpipe_pipeline_class_extract_archive(self):
429432
project1 = Project.objects.create(name="Analysis")

0 commit comments

Comments
 (0)