Skip to content

Commit ad54681

Browse files
committed
analyze a whole database and find its relationships
1 parent c7b07ea commit ad54681

File tree

5 files changed

+215
-2
lines changed

5 files changed

+215
-2
lines changed

Diff for: package-lock.json

+31
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: package.json

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
"@types/mocha": "^10.0.1",
5757
"@types/node": "^18.11.18",
5858
"@types/reservoir": "^0.1.0",
59+
"@types/yargs": "^17.0.29",
5960
"@typescript-eslint/eslint-plugin": "^5.47.1",
6061
"@typescript-eslint/parser": "^5.47.1",
6162
"bson": "^5.0.1",

Diff for: scripts/analyze-database.ts

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env npx ts-node
2+
3+
import { MongoClient, Document } from 'mongodb';
4+
import yargs from 'yargs';
5+
import { hideBin } from 'yargs/helpers';
6+
7+
import type { Schema, Relationship } from '../src';
8+
import { SchemaAnalyzer, findRelationshipsForSchema } from '../src';
9+
10+
/**
 * Runs every document from the given async source through a fresh
 * SchemaAnalyzer and hands back the analyzer itself (not its result), so the
 * caller decides when to call `getResult()`.
 *
 * The analyzer is created with `storeValues: true`.
 *
 * @param documents - Async iterable of documents to analyze, e.g. a cursor.
 * @returns The analyzer after it has seen every document.
 */
async function analyzeCollection(documents: AsyncIterable<Document>) {
  const schemaAnalyzer = new SchemaAnalyzer({ storeValues: true });

  for await (const document of documents) {
    schemaAnalyzer.analyzeDoc(document);
  }

  return schemaAnalyzer;
}
19+
20+
// Module-level so the script's entry point can close the connection after
// run() settles, whether it succeeded or failed.
let client: MongoClient;

/**
 * Script body: connects to the database named on the command line, samples
 * each collection with `$sample`, analyzes the sampled documents and prints
 * the relationships found between collections.
 *
 * Positional arguments: `connectionURI databaseName`.
 * Option: `--sampleSize` (number, default 1000).
 *
 * @throws Error with a usage message if either positional argument is missing.
 */
async function run() {
  const args = await yargs(hideBin(process.argv))
    .option('sampleSize', { type: 'number', default: 1000 })
    .argv;

  const positionals = args._ as [string, string];
  const uri = positionals[0];
  const databaseName = positionals[1];
  if (!uri || !databaseName) {
    throw new Error('USAGE: analyze-database.ts connectionURI databaseName');
  }

  client = new MongoClient(uri);
  await client.connect();

  const db = client.db(databaseName);

  const collectionInfos = await db.listCollections().toArray();
  console.dir(collectionInfos);

  // Schema per collection name, filled in as each collection is analyzed.
  const collections: Record<string, Schema> = {};
  const relationships: Relationship[] = [];
  const collectionNames = collectionInfos.map((info) => info.name);

  for (const { name } of collectionInfos) {
    console.log(name);

    const samplePipeline = [{ $sample: { size: args.sampleSize } }];
    const sampledDocuments = db
      .collection(name)
      .aggregate(samplePipeline, { allowDiskUse: true });

    const schema = (await analyzeCollection(sampledDocuments)).getResult();
    collections[name] = schema;

    const found = await findRelationshipsForSchema(db, name, collectionNames, schema);
    relationships.push(...found);

    console.log(); // newline
  }

  console.dir(relationships, { depth: null });
}
68+
69+
// Only execute when this file is run directly as a script, not when imported.
if (require.main === module) {
  run()
    .finally(() =>
      // Return the close() promise so the chain waits for the connection to
      // shut down. A failure while closing is logged here rather than rethrown
      // so it can neither become an unhandled rejection nor replace the
      // original error from run().
      client?.close().catch((closeErr) => {
        console.error(closeErr?.stack ?? closeErr);
        process.exitCode = 1;
      })
    )
    .catch((err) => {
      // Rejections are not guaranteed to be Error instances; fall back to the
      // raw value when there is no stack to print.
      console.error(err?.stack ?? err);
      process.exit(1);
    });
}

Diff for: src/database-analyzer.ts

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
2+
import type { Schema } from './schema-analyzer';
3+
import type { Db } from 'mongodb';
4+
5+
/** Points at one field of one collection. */
type CollectionFieldReference = {
  /** Name of the collection the field lives in. */
  collection: string;
  /** Path segments leading to the field. */
  fieldPath: string[];
};

/** A field reference together with values collected for that field. */
type FieldReferenceWithValues = CollectionFieldReference & {
  values: any[];
};

/** A detected link from a field in one collection to a field in another. */
export type Relationship = {
  /** The field holding the referencing values. */
  from: CollectionFieldReference;
  /** The field those values were found in. */
  to: CollectionFieldReference;
};
18+
19+
/**
 * Shuffles the array in place, walking from the last position down to the
 * second and swapping each one with a randomly chosen position at or before
 * it. Arrays with fewer than two elements are left untouched.
 *
 * @param array - The array to reorder; it is mutated.
 */
function shuffleArray(array: any[]) {
  let position = array.length - 1;

  while (position > 0) {
    const pick = Math.floor(Math.random() * (position + 1));

    const held = array[position];
    array[position] = array[pick];
    array[pick] = held;

    position -= 1;
  }
}
25+
26+
/**
 * Picks out the fields of a schema that could be references to documents in
 * other collections: any field other than `_id` that has stored values of
 * BSON type ObjectId or UUID.
 *
 * Each candidate's values are shuffled before being returned, and the path of
 * every candidate field is logged to the console.
 *
 * @param collectionName - Name of the collection the schema describes.
 * @param schema - Analyzed schema; only `schema.fields` is inspected.
 * @returns One entry per candidate field, carrying the collected values.
 */
function findCandidateReferencesForSchema(collectionName: string, schema: Schema) {
  const referenceBsonTypes = ['ObjectId', 'UUID'];
  const candidates: FieldReferenceWithValues[] = [];

  for (const field of schema.fields) {
    // _id is what other fields point at, never a reference itself
    if (field.name === '_id') {
      continue;
    }

    // TODO: also consider anything matching a known naming convention like /_id$/
    // TODO: we might also want to consider any large integers if there are lots of different values?

    const collected: any[] = [];
    for (const typeInfo of field.types) {
      if (!referenceBsonTypes.includes(typeInfo.bsonType)) {
        continue;
      }
      const stored = (typeInfo as { values: any[] }).values ?? [];
      for (const value of stored) {
        collected.push(value);
      }
    }

    if (collected.length === 0) {
      continue;
    }

    // in case the sample came from limit()* and wasn't already sorted randomly
    shuffleArray(collected);

    candidates.push({
      collection: collectionName,
      fieldPath: field.path,
      values: collected
    });
    console.log(field.path);
  }

  return candidates;
}
58+
59+
/**
 * Tests each candidate field against each collection by asking whether any of
 * the candidate's first few values occur as an `_id` in that collection.
 *
 * The first collection with at least one match is recorded as the target of
 * the relationship and the remaining collections are not checked for that
 * candidate. Every match is also logged to the console.
 *
 * @param db - Database handle used to query the target collections.
 * @param collectionNames - Names of the collections to probe, in order.
 * @param candidatePaths - Candidate reference fields with their values.
 * @returns The relationships found, each pointing at the target's `_id`.
 */
async function findRelationshipsCandidate(db: Db, collectionNames: string[], candidatePaths: FieldReferenceWithValues[]) {
  // How many of a candidate's values are used to probe each collection.
  const probeSize = 10;
  const relationships: Relationship[] = [];

  // not the most efficient..
  for (const { collection, fieldPath, values } of candidatePaths) {
    // The probe ids depend only on the candidate, so take them once instead
    // of re-slicing for every target collection.
    const ids = values.slice(0, probeSize);
    if (!ids.length) {
      // `$in: []` can never match; skip the round trips entirely.
      continue;
    }

    for (const target of collectionNames) {
      // $count emits no document at all when nothing matched, so an empty
      // result array means "no match" in this collection.
      const result = await db.collection(target).aggregate([
        { $match: { _id: { $in: ids } } },
        { $count: 'matches' }
      ]).toArray();

      if (result.length) {
        console.log(collection, fieldPath, result);
        relationships.push({
          from: {
            collection,
            fieldPath
          },
          to: {
            collection: target,
            fieldPath: ['_id']
          }
        });
        // no point checking the collections - we assume this is a many to one
        break;
      }
    }
  }

  return relationships;
}
91+
92+
/**
 * Finds relationships that start from the fields of one collection's schema.
 *
 * Works in two steps: first it collects the schema's candidate reference
 * fields, then it checks those candidates against the given collections.
 *
 * @param db - Database handle used to query the collections.
 * @param collectionName - Name of the collection the schema belongs to.
 * @param collectionNames - Names of the collections to look for targets in.
 * @param schema - Analyzed schema of `collectionName`.
 * @returns The relationships found for this schema.
 */
export async function findRelationshipsForSchema(db: Db, collectionName: string, collectionNames: string[], schema: Schema) {
  const candidates = findCandidateReferencesForSchema(collectionName, schema);
  const relationships = await findRelationshipsCandidate(db, collectionNames, candidates);
  return relationships;
}

Diff for: src/index.ts

+10-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ import type {
2323
SimplifiedSchema
2424
} from './schema-analyzer';
2525
import * as schemaStats from './stats';
26+
import type {
27+
Relationship
28+
} from './database-analyzer';
29+
import {
30+
findRelationshipsForSchema
31+
} from './database-analyzer';
2632

2733
type MongoDBCursor = AggregationCursor | FindCursor;
2834

@@ -109,7 +115,8 @@ export type {
109115
SimplifiedSchemaDocumentType,
110116
SimplifiedSchemaType,
111117
SimplifiedSchemaField,
112-
SimplifiedSchema
118+
SimplifiedSchema,
119+
Relationship
113120
};
114121

115122
export {
@@ -119,5 +126,6 @@ export {
119126
getSchemaPaths,
120127
getSimplifiedSchema,
121128
SchemaAnalyzer,
122-
schemaStats
129+
schemaStats,
130+
findRelationshipsForSchema
123131
};

0 commit comments

Comments
 (0)