Skip to content

Commit bf07d2b

Browse files
committed
Add hash arrangements in models and importers
Signed-off-by: Shivam Sandbhor <[email protected]>
1 parent 4c952ad commit bf07d2b

File tree

5 files changed

+63
-13
lines changed

5 files changed

+63
-13
lines changed

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,5 @@ tqdm==4.41.1
3030
wcwidth==0.1.7
3131
whitenoise==5.0.1
3232
zipp==0.6.0
33-
pytoml==0.1.21
33+
pytoml==0.1.21
34+
xxhash==1.4.3

vulnerabilities/models.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ class Vulnerability(models.Model):
3131
A software vulnerability with minimal information.
3232
Identifiers other than CVE ID are stored as VulnerabilityReference.
3333
"""
34-
cve_id = models.CharField(max_length=50, help_text='CVE ID', unique=True, null=True)
35-
summary = models.TextField(help_text='Summary of the vulnerability', blank=True)
34+
cve_id = models.CharField(
35+
max_length=50, help_text='CVE ID', unique=True, null=True)
36+
summary = models.TextField(
37+
help_text='Summary of the vulnerability', blank=True)
3638
cvss = models.FloatField(max_length=100, help_text='CVSS Score', null=True)
3739

3840
def __str__(self):
@@ -68,7 +70,8 @@ class Package(PackageURLMixin):
6870
A software package with minimal identifying information.
6971
Other identifiers are stored as PackageReference.
7072
"""
71-
vulnerabilities = models.ManyToManyField(to='Vulnerability', through='ImpactedPackage')
73+
vulnerabilities = models.ManyToManyField(
74+
to='Vulnerability', through='ImpactedPackage')
7275

7376
def __str__(self):
7477
return self.name
@@ -123,3 +126,7 @@ class PackageReference(models.Model):
123126

124127
def __str__(self):
125128
return self.platform
129+
130+
131+
class AdvisoryHashes(models.Model):
132+
hash = models.BigIntegerField(primary_key=True, unique=True)

vulnerabilities/scraper/npm.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,13 @@
2222
# Visit https://github.com/nexB/vulnerablecode/ for support and download.
2323

2424
import json
25+
import xxhash
2526
from dephell_specifier import RangeSpecifier
2627
from urllib.request import urlopen
2728
from urllib.error import HTTPError
2829

30+
from vulnerabilities.models import AdvisoryHashes
31+
2932

3033
NPM_URL = 'https://registry.npmjs.org{}'
3134
PAGE = '/-/npm/v1/security/advisories?page=0'
@@ -93,15 +96,26 @@ def extract_data(JSON):
9396
continue
9497
# NPM registry has no data regarding this package finally we skip these
9598

96-
package_vulnerabilities.append({
99+
package_vulnerability = {
97100
'package_name': package_name,
98101
'summary': obj.get('overview', ''),
99102
'cve_ids': obj.get('cves', []),
100103
'fixed_versions': fixed_versions,
101104
'affected_versions': affected_versions,
102105
'severity': obj.get('severity', ''),
103106
'advisory': obj.get('url', ''),
104-
})
107+
}
108+
109+
pkg_vuln_hash = xxhash.xxh32(json.dumps(
110+
package_vulnerability, sort_keys=True)).intdigest()
111+
hash_query = AdvisoryHashes.objects.filter(hash=pkg_vuln_hash)
112+
if hash_query:
113+
# In the past we already had this same data, so much work
114+
# for nothing
115+
continue
116+
package_vulnerabilities.append(package_vulnerability)
117+
AdvisoryHashes.objects.create(hash=pkg_vuln_hash)
118+
105119
return package_vulnerabilities
106120

107121

@@ -115,7 +129,15 @@ def scrape_vulnerabilities():
115129
try:
116130
cururl = NPM_URL.format(nextpage)
117131
response = json.load(urlopen(cururl))
118-
package_vulnerabilities.extend(extract_data(response))
132+
resp_hash = xxhash.xxh32(json.dumps(
133+
response, sort_keys=True)).intdigest()
134+
hash_query = AdvisoryHashes.objects.filter(hash=resp_hash)
135+
136+
if not hash_query:
137+
138+
package_vulnerabilities.extend(extract_data(response))
139+
AdvisoryHashes.objects.create(hash=resp_hash)
140+
119141
nextpage = response.get('urls', {}).get('next')
120142

121143
except HTTPError as error:

vulnerabilities/scraper/ruby.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,29 @@
1-
import os
1+
import saneyaml
22
import urllib.request
33
from urllib.error import HTTPError
44
from zipfile import ZipFile
55
from io import BytesIO
6-
import saneyaml
76
from dephell_specifier import RangeSpecifier
87
from urllib.request import urlopen
8+
from xxhash import xxh32
9+
10+
from vulnerabilities.models import AdvisoryHashes
911

1012
RUBYSEC_DB_URL = 'https://github.com/rubysec/ruby-advisory-db/archive/master.zip'
1113

1214

1315
def rubygem_advisories(url, prefix='ruby-advisory-db-master/gems/'):
16+
hash_of_zip = xxh32()
1417
with urlopen(url) as response:
15-
with ZipFile(BytesIO(response.read())) as zf:
18+
with BytesIO(response.read()) as zfbytes:
19+
chunk = zfbytes.read()
20+
hash_of_zip.update(chunk)
21+
hash_of_zip = hash_of_zip.intdigest()
22+
hash_query = AdvisoryHashes.objects.filter(hash=hash_of_zip)
23+
if hash_query:
24+
return []
25+
AdvisoryHashes.objects.create(hash=hash_of_zip)
26+
zf = ZipFile(zfbytes)
1627
for path in zf.namelist():
1728
if path.startswith(prefix) and path.endswith('.yml'):
1829
yield saneyaml.load(zf.open(path))
@@ -67,11 +78,20 @@ def import_vulnerabilities():
6778
break
6879

6980
affected_versions = all_versions - unaffected_versions
70-
vulnerability_package_dicts.append({
81+
vuln_pkg_dict = {
7182
'package_name': package_name,
7283
'cve_id': vulnerability_id,
7384
'fixed_versions': unaffected_versions,
7485
'affected_versions': affected_versions,
7586
'advisory': advisory_url
76-
})
87+
}
88+
pkg_vuln_hash = xxhash.xxh32(json.dumps(
89+
vuln_pkg_dict, sort_keys=True)).intdigest()
90+
hash_query = AdvisoryHashes.objects.filter(hash=pkg_vuln_hash)
91+
92+
if hash_query:
93+
continue
94+
95+
AdvisoryHashes.objects.create(hash=pkg_vuln_hash)
96+
vulnerability_package_dicts.append(vuln_pkg_dict)
7797
return vulnerability_package_dicts

vulnerabilities/tests/test_npm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_get_all_versions():
4141

4242

4343
@pytest.mark.webtest
44-
def test_extract_data():
44+
def test_extract_data(db):
4545
with open(os.path.join(TEST_DATA, 'npm_test.json')) as f:
4646
test_data = json.load(f)
4747

0 commit comments

Comments
 (0)