Commit 70c8f5a

fix: Remediation to deal with duplicates and removed repositories in source data
1 parent 4cbc22a commit 70c8f5a

16 files changed: +276 -20 lines changed


.gitignore

+1
@@ -3,3 +3,4 @@ coverage/
 node_modules/
 .nyc_output/
 .vscode/
+/.env

docs/CNAME

+1
@@ -0,0 +1 @@
+tools.openapis.org

gulpfile.js/index.js

+14-2
@@ -3,6 +3,12 @@ const { src, dest } = require('gulp');
 const transform = require('gulp-transform');
 const rename = require('gulp-rename');
 
+const { argv } = require('yargs')
+  .option('metadata', {
+    describe: 'Input file that describes build metadata',
+    default: 'gulpfile.js/metadata.json',
+  });
+
 const {
   classifyTools,
   validateMetadata,
@@ -11,11 +17,14 @@ const {
   readSourceData,
   mergeSources,
   normaliseSources,
+  purgeSources,
 } = require('../lib/data');
 
+console.log(`info: Using metadata file <${argv.metadata}>`);
+
 // This is complete scan of the source data. All sources will be retrieved and processed
 // though the transformation code, rebuilding tools.yaml
-const full = () => src('gulpfile.js/metadata.json')
+const full = () => src(argv.metadata)
   .pipe(transform('utf8', validateMetadata))
   .pipe(transform('utf8', readSourceData))
   .pipe(rename('raw-sources.yaml')) // Write raw data for debug purposes
@@ -24,15 +33,18 @@ const full = () => src('gulpfile.js/metadata.json')
   .pipe(transform('utf8', normaliseSources))
   .pipe(transform('utf8', getRepositoryMetadata))
   .pipe(transform('utf8', classifyTools))
+  .pipe(transform('utf8', purgeSources))
   .pipe(rename('tools.yaml'))
   .pipe(dest('src/_data'));
 
 // This is a scan of the metadata associated with the repositories already
 // held in the repository. No new source data is retrieved from sources
-const metadata = () => src('gulpfile.js/metadata.json')
+// but existing data may be removed if not found
+const metadata = () => src(argv.metadata)
  .pipe(transform('utf8', validateMetadata))
  .pipe(transform('utf8', readLocalSourceData))
  .pipe(transform('utf8', getRepositoryMetadata))
+  .pipe(transform('utf8', purgeSources))
  .pipe(rename('tools.yaml'))
  .pipe(dest('src/_data'));
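
The build entry point is now parameterised: yargs supplies a --metadata option (defaulting to gulpfile.js/metadata.json) and both tasks finish with the new purgeSources stage. Below is a minimal sketch of how a further stage could be wired up under the same gulp-transform contract, where each stage receives the YAML payload as a UTF-8 string and returns the transformed string; the purgeOnly task is hypothetical and not part of this commit.

const { src, dest } = require('gulp');
const transform = require('gulp-transform');
const rename = require('gulp-rename');

const { purgeSources } = require('../lib/data');

// Hypothetical task: re-run only the purge step against the current tools.yaml,
// without fetching anything from the remote sources.
const purgeOnly = () => src('src/_data/tools.yaml')
  .pipe(transform('utf8', purgeSources))
  .pipe(rename('tools.yaml'))
  .pipe(dest('src/_data'));

module.exports = { purgeOnly };

Because the option has a default, gulp full behaves as before; passing --metadata <path> on the gulp command line points the build at an alternative metadata file.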

lib/data/index.js

+2
@@ -6,6 +6,7 @@ const {
   readSourceData,
   mergeSources,
   normaliseSources,
+  purgeSources,
 } = require('./transform');
 
 module.exports = {
@@ -16,4 +17,5 @@ module.exports = {
   readSourceData,
   mergeSources,
   normaliseSources,
+  purgeSources,
 };

+7-1
@@ -1,13 +1,19 @@
 const fs = require('fs');
 const YAML = require('js-yaml');
 
+const { logger } = require('../util');
+
 module.exports = async (args) => {
   const { url } = args;
 
   if (!url) {
     throw new Error(`Mandatory parameters missing when invoking ${__filename}`);
   }
 
-  return YAML.load(fs.readFileSync(url, 'utf-8'))
+  const tools = YAML.load(fs.readFileSync(url, 'utf-8'))
     .map((tool) => ({ ...tool, foundInMaster: true }));
+
+  logger('Number of sources found in master', tools.length);
+
+  return tools;
 };
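
For reference, a sketch (values invented) of the shape each record takes once this reader has run: the existing tools.yaml entry plus the foundInMaster flag, which merge-sources.js uses to pick up any oldLocations recorded by a previous purge run.

// Hypothetical master record after the .map() above; only foundInMaster is added here,
// oldLocations is whatever an earlier run wrote into tools.yaml.
const exampleMasterRecord = {
  name: 'example-tool',
  repository: 'https://github.com/new-org/example-tool',
  oldLocations: ['https://github.com/old-org/example-tool'],
  foundInMaster: true,
};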

lib/data/repo/github.js

+25-2
@@ -59,6 +59,7 @@ module.exports = async (
   repositoryMetadata = {},
 ) => {
   const paths = [
+    { source: '$.data.id', target: 'repositoryId' },
     { source: '$.data.description', target: 'description' },
     { source: '$.data.created_at', target: 'created' },
     { source: '$.data.updated_at', target: 'updated' },
@@ -97,6 +98,8 @@ module.exports = async (
   }
 
   const repoUrl = `https://api.github.com/repos/${organization}/${repo}`;
+  let newUrl = null;
+  let moved = false;
 
   // Get repository metadata. If a 404 is returned mark as not found, throw any other errors
   try {
@@ -105,11 +108,26 @@ module.exports = async (
     repoResponse = await http.get(
       repoUrl,
       {
-        auth: { username, password }, headers, timeout: 60000, validateStatus,
+        auth: { username, password }, headers, timeout: 60000, validateStatus, maxRedirects: 0,
       },
     );
+
+    if (repoResponse.status === 301) {
+      // The repository has moved so prepare to drop it out of the dataset or use the new URL
+      // to identify the repository
+      const targetUrl = repoResponse.data.url;
+
+      repoResponse = await http.get(targetUrl, {
+        auth: { username, password }, timeout: 60000, validateStatus,
+      });
+
+      newUrl = repoResponse.data.html_url
+        .toLowerCase();
+      moved = true;
+    }
   } catch (err) {
     if (!err.response) {
+      /* istanbul ignore if */
      if (process.env.NODE_ENV !== 'test') {
        console.error(err);
      }
@@ -142,6 +160,7 @@ module.exports = async (
     );
   } catch (err) {
     if (!err.response) {
+      /* istanbul ignore if */
      if (process.env.NODE_ENV !== 'test') {
        console.error(err);
      }
@@ -176,6 +195,10 @@ module.exports = async (
 
     return output;
   }, {
-    repositoryMetadata: getReadme(readmeResponse, url, repositoryMetadata),
+    repositoryMetadata: {
+      ...getReadme(readmeResponse, url, repositoryMetadata),
+      ...(newUrl ? { newUrl } : {}),
+      ...(moved ? { moved } : {}),
+    },
  });
};
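
A standalone sketch of the moved-repository detection added above, assuming axios as the underlying HTTP client and a validateStatus that treats 301 and 404 as non-errors (the real module defines these elsewhere): GitHub answers a renamed or transferred repository with a 301 whose JSON body carries the canonical API URL, and one follow-up request yields the new html_url.

const axios = require('axios');

// Assumption for this sketch: 3xx and 404 responses are returned rather than thrown
const validateStatus = (status) => status < 400 || status === 404;

const resolveRepository = async (organization, repo) => {
  const repoUrl = `https://api.github.com/repos/${organization}/${repo}`;

  let response = await axios.get(repoUrl, { maxRedirects: 0, timeout: 60000, validateStatus });

  if (response.status === 301) {
    // Follow the pointer in the 301 body to the repository's new identity
    response = await axios.get(response.data.url, { timeout: 60000, validateStatus });
    return { moved: true, newUrl: response.data.html_url.toLowerCase() };
  }

  if (response.status === 404) {
    return { notFound: true };
  }

  return { moved: false, repositoryId: response.data.id };
};

Setting maxRedirects: 0 is what makes the 301 visible to the calling code instead of being followed transparently.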

lib/data/transform/index.js

+2
@@ -4,12 +4,14 @@ const mergeSources = require('./merge-sources');
 const normaliseSources = require('./normalise-sources');
 const { readLocalSourceData, readSourceData } = require('./read-source-data');
 const validateMetadata = require('./validate-metadata');
+const purgeSources = require('./purge-sources');
 
 module.exports = {
   classifyTools,
   getRepositoryMetadata,
   mergeSources,
   normaliseSources,
+  purgeSources,
   readLocalSourceData,
   readSourceData,
   validateMetadata,

lib/data/transform/merge-sources.js

+24-3
@@ -12,7 +12,8 @@ module.exports = async (rawSources) => {
   logger(__filename, 'mergeSources');
   const sources = YAML.load(rawSources);
 
-  // Get properties across all sources
+  // Get properties across all sources. This is used to help define the precedence of a given key
+  // when the properties are normalised in subsequent scripts. Executed now as sources are merged
   const sourceProperties = sources
     .reduce((output, source) => Object.assign(
       output,
@@ -22,12 +23,32 @@ module.exports = async (rawSources) => {
     {},
   ), {});
 
+  // Merge all sources that have been previously moved into a single array. This will be used to
+  // prevent work being done again on data pulled back in from the source repositories
+  // Any repositories matching a value found in this array will be ignored
+  const movedSources = sources
+    .filter((source) => source.foundInMaster && source.oldLocations)
+    .reduce((output, source) => output.concat(source.oldLocations), []);
+
   // This of course removes some of the flexibility we get from the processor approach
   // Need to devise a way to discover the uri rather than using hard-coded values
   const mergedSources = sources
-    .reduce((output, source) => {
+    .map((source) => ({
+      uri: getUri(source)
+        .toLowerCase(),
+      source,
+    }))
+    .filter((source) => {
+      if (movedSources.indexOf(source.uri) !== -1) {
+        logger(`Ignoring repository as found to have moved at previous run: ${source.uri}`);
+        return false;
+      }
+
+      return true;
+    })
+    .reduce((output, filteredSource) => {
       const updatedOutput = output;
-      const uri = getUri(source).toLowerCase();
+      const { uri, source } = filteredSource;
 
       if (!updatedOutput[uri]) {
         updatedOutput[uri] = { master: {}, sources: [] };
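
A small illustration, with invented data, of the new filter: any raw source whose URI matches an oldLocations entry carried in from the master data is dropped before merging, so a renamed repository is not re-ingested under its old name.

const movedExample = [
  {
    foundInMaster: true,
    repository: 'https://github.com/new-org/tool',
    oldLocations: ['https://github.com/old-org/tool'],
  },
  { link: 'https://github.com/Old-Org/Tool', title: 'Stale source entry' },
];

const movedSources = movedExample
  .filter((source) => source.foundInMaster && source.oldLocations)
  .reduce((output, source) => output.concat(source.oldLocations), []);

// movedSources -> ['https://github.com/old-org/tool']; because getUri() output is
// lower-cased before the comparison, the second (stale) entry is ignored.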

lib/data/transform/normalise-sources.js

+11-9
@@ -1,8 +1,7 @@
 const YAML = require('js-yaml');
-const crypto = require('crypto');
 
 const {
-  getUri, logger, normalisePropertyNames, normaliseSplitters,
+  getUri, getHash, logger, normalisePropertyNames, normaliseSplitters,
 } = require('../util');
 
 /**
@@ -16,7 +15,8 @@ module.exports = async (rawSources) => {
 
   const dedupRepositoryValues = (data) => {
     if (Array.isArray(data)) {
-      return [...new Set(data.map((value) => value.toLowerCase()))];
+      return [...new Set(data.map((value) => value
+        .toLowerCase()))];
     }
     return data ? data.toLowerCase() : data;
   };
@@ -35,8 +35,8 @@
 
   const normalisedSources = mergedSources
     .map((tool) => {
-    // Loop across source data and normalise based on occurances of a given property
-    // and naming conventions implied by preponderance in source data
+      // Loop across source data and normalise based on occurances of a given property
+      // and naming conventions implied by preponderance in source data
       const mergedSourceProperties = tool.sources
         .reduce((output, source) => Object.assign(
           output,
@@ -74,13 +74,15 @@ module.exports = async (rawSources) => {
        );
      }, {});
 
-      // Check whether there is any chance of setting a repository value based on any recognise
+      // Check whether there is any chance of setting a repository value based on any recognised
      // properties from sources
      if (!source.repository) {
        const candidate = getUri(source);
 
+        // This will be expanded as more sources are implemented
        if (candidate && candidate.match(/^https.*github\.com(?:\/[a-zA-Z0-9-_.~]+){2}$/)) {
-          source.repository = candidate;
+          source.repository = candidate
+            .toLowerCase();
        }
      }
 
@@ -91,13 +93,13 @@
 
      if (!seed) {
        if (process.env.NODE_ENV !== 'test') {
-          console.error(tool);
+          console.error(`Cannot map URL to uniquely identity tools: ${JSON.stringify(tool)}`);
        }
 
        throw new Error('Could not discover URL and therefore generate ID for tooling source. Check error log for tool properties');
      }
 
-      return Object.assign(tool, { id: crypto.createHash('md5').update(seed).digest('hex') });
+      return Object.assign(tool, { id: getHash(seed.toLowerCase()) });
    });
 
  return YAML.dump(normalisedSources);
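
The tool ID is now derived from the lower-cased seed through the shared getHash helper, so the same repository referenced with different casing collapses to a single identifier instead of yielding duplicate entries. A quick sketch (repository URL invented):

const crypto = require('crypto');

const getHash = (seed) => crypto.createHash('md5').update(seed).digest('hex');

const a = getHash('https://github.com/Example-Org/Example-Tool'.toLowerCase());
const b = getHash('https://github.com/example-org/example-tool'.toLowerCase());

console.log(a === b); // true: casing differences no longer split one tool into two IDs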

lib/data/transform/purge-sources.js

+82
@@ -0,0 +1,82 @@
+const YAML = require('js-yaml');
+
+const { getHash, logger } = require('../util');
+
+module.exports = (rawSources) => {
+  logger(__filename, 'purgeSources');
+
+  const sources = YAML.load(rawSources);
+
+  // List all repository URLs
+  const allRepositories = sources
+    .filter((source) => source.repository)
+    .map((source) => source.repository);
+
+  // Collect up any repositories that have been moved as a pointer ready for merging them with the
+  // the new GitHub URL
+  const newRepositoryLocations = sources
+    .filter((source) => source.repositoryMetadata && source.repositoryMetadata.moved)
+    .reduce((output, source) => {
+      if (output[source.repositoryMetadata.newUrl]) {
+        // eslint-disable-next-line no-param-reassign
+        output[source.repositoryMetadata.newUrl] = output[source.repositoryMetadata.newUrl]
+          .concat(source.repository);
+
+        return output;
+      }
+
+      return Object.assign(
+        output,
+        { [source.repositoryMetadata.newUrl]: [source.repository] },
+      );
+    }, {});
+
+  // Remove anything that is not found or has been moved. If a repository has been moved but the
+  // new location is not in source then change the value of the source.repository property
+  return YAML.dump(
+    sources
+      .map((source) => {
+        const updatedSource = source;
+        const newRepositoryLocation = newRepositoryLocations[source.repository];
+
+        if (newRepositoryLocation) {
+          // Source repository is updated to include old locations for tracking purposes
+
+          // eslint-disable-next-line no-param-reassign
+          updatedSource.oldLocations = [...new Set((source.oldLocations || [])
+            .concat(newRepositoryLocations[source.repository]))];
+        } else if ((source.repositoryMetadata || {}).moved
+          && allRepositories.indexOf(source.repositoryMetadata.newUrl) === -1) {
+          // Source repository has been moved but there is no data in the repository to move it to
+          // Update the repository property with the new URL. This will then be ignored on the
+          // next run and the updated repository value retained in the tools.yaml
+
+          logger(`Moving repository metadata: ${source.repository} to new URL reference: ${updatedSource.repositoryMetadata.newUrl}`);
+
+          updatedSource.oldLocations = [source.repository];
+          updatedSource.repository = updatedSource.repositoryMetadata.newUrl;
+          updatedSource.id = getHash(source.repository);
+
+          delete updatedSource.repositoryMetadata.newUrl;
+          delete updatedSource.repositoryMetadata.moved;
+        }
+
+        return updatedSource;
+      })
+      .filter((source) => {
+        const { repositoryMetadata } = source;
+
+        if (!repositoryMetadata || (!repositoryMetadata.notFound && !repositoryMetadata.moved)) {
+          return true;
+        }
+
+        if (repositoryMetadata.notFound) {
+          logger(`Removing repository as not found at target location: ${source.repository}`);
+          return false;
+        }
+
+        logger(`Removing repository as it has been moved: ${source.repository} and is already catalogued at new URL reference: ${source.repositoryMetadata.newUrl}`);
+        return false;
+      }),
+  );
+};
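
A hypothetical before/after run of purge-sources.js (repository names invented, require path assumes the repository root): the input is the YAML string produced by the previous pipeline stage, and the three entries cover the kept, not-found and moved cases.

const YAML = require('js-yaml');
const purgeSources = require('./lib/data/transform/purge-sources');

const input = YAML.dump([
  { repository: 'https://github.com/org/kept' },
  { repository: 'https://github.com/org/gone', repositoryMetadata: { notFound: true } },
  {
    repository: 'https://github.com/old-org/renamed',
    repositoryMetadata: { moved: true, newUrl: 'https://github.com/new-org/renamed' },
  },
]);

const output = YAML.load(purgeSources(input));

// 'org/gone' is removed because it was not found at its target location.
// 'old-org/renamed' survives but is re-pointed at the new URL, with the old URL kept in
// oldLocations and a fresh id derived from the new repository value.
console.log(output.map((source) => source.repository));
// [ 'https://github.com/org/kept', 'https://github.com/new-org/renamed' ]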

lib/data/transform/read-source-data.js

+1-1
@@ -6,7 +6,7 @@ const { logger } = require('../util');
 
 const readSource = async (output, source) => {
   const update = await output;
-  logger(source.title, 'Reading source data...');
+  logger(`${__dirname}/${source.processor}`, source.title, 'Reading source data...');
 
   // Yes, this is an anti-pattern and opinionated approach...
   // but it provides a nice level of flexibility in the build mechanism

lib/data/util.js

+2
@@ -1,8 +1,10 @@
 const dice = require('talisman/metrics/dice');
 const dl = require('talisman/metrics/damerau-levenshtein');
 const log = require('fancy-log');
+const crypto = require('crypto');
 
 module.exports = {
+  getHash: (seed) => crypto.createHash('md5').update(seed).digest('hex'),
   getUri: (source) => (source.repository || source.github || source.link || source.homepage),
   logger: (...args) => {
     if (process.env.NODE_ENV !== 'test') {
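
Brief usage sketch for the shared helpers now exported here (argument values invented, require path assumes the repository root):

const { getHash, getUri, logger } = require('./lib/data/util');

logger('only printed when NODE_ENV is not "test"');

// getUri falls back through repository, github, link and homepage
console.log(getUri({ link: 'https://github.com/example-org/example-tool' }));

// getHash is the md5 hex digest used for tool IDs and by purge-sources when re-pointing a repository
console.log(getHash('https://github.com/example-org/example-tool'));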

package.json

+2-1
@@ -10,13 +10,14 @@
     "build:site:webpack": "webpack --mode ${NODE_ENV:-development}",
     "build:site:eleventy": "eleventy",
     "build:site": "NODE_ENV=production npm-run-all -l clean build:site:*",
+    "build:all": "npm run build:data:full && npm run build:site",
     "coverage": "export NODE_ENV=test && nyc --reporter=html --reporter=text ./node_modules/.bin/_mocha test/lib/* --exit --recursive --timeout 10000",
     "coverage:desktop": "yarn run coverage && open coverage/index.html",
     "clean": "rimraf docs",
     "serve:webpack": "webpack --mode development --watch",
     "serve:eleventy": "ELEVENTY_ENV=development eleventy --serve",
     "serve": "npm-run-all clean --parallel serve:*",
-    "test": "export NODE_ENV=test && mocha test --exit --recursive --timeout 10000"
+    "test": "export NODE_ENV=test && mocha --exit --recursive --timeout 10000"
   },
   "dependencies": {
     "ajv": "^8.10.0",
