Add advisory hash tracking to models and importers

sbs2001 · sbs2001 · commit bf07d2b6f6ec · 2020-04-02T23:46:52.000+05:30
Signed-off-by: Shivam Sandbhor <shivam.sandbhor@gmail.com>
diff --git a/requirements.txt b/requirements.txt
@@ -30,4 +30,5 @@ tqdm==4.41.1
 wcwidth==0.1.7
 whitenoise==5.0.1
 zipp==0.6.0
-pytoml==0.1.21
+pytoml==0.1.21
+xxhash==1.4.3
diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
@@ -31,8 +31,10 @@ class Vulnerability(models.Model):
     A software vulnerability with minimal information.
     Identifiers other than CVE ID are stored as VulnerabilityReference.
     """
-    cve_id = models.CharField(max_length=50, help_text='CVE ID', unique=True, null=True)
-    summary = models.TextField(help_text='Summary of the vulnerability', blank=True)
+    cve_id = models.CharField(
+        max_length=50, help_text='CVE ID', unique=True, null=True)
+    summary = models.TextField(
+        help_text='Summary of the vulnerability', blank=True)
     cvss = models.FloatField(max_length=100, help_text='CVSS Score', null=True)
 
     def __str__(self):
@@ -68,7 +70,8 @@ class Package(PackageURLMixin):
     A software package with minimal identifying information.
     Other identifiers are stored as PackageReference.
     """
-    vulnerabilities = models.ManyToManyField(to='Vulnerability', through='ImpactedPackage')
+    vulnerabilities = models.ManyToManyField(
+        to='Vulnerability', through='ImpactedPackage')
 
     def __str__(self):
         return self.name
@@ -123,3 +126,7 @@ class PackageReference(models.Model):
 
     def __str__(self):
         return self.platform
+
+
+class AdvisoryHashes(models.Model):
+    hash = models.BigIntegerField(primary_key=True, unique=True)
diff --git a/vulnerabilities/scraper/npm.py b/vulnerabilities/scraper/npm.py
@@ -22,10 +22,13 @@
 #  Visit https://github.com/nexB/vulnerablecode/ for support and download.
 
 import json
+import xxhash
 from dephell_specifier import RangeSpecifier
 from urllib.request import urlopen
 from urllib.error import HTTPError
 
+from vulnerabilities.models import AdvisoryHashes
+
 
 NPM_URL = 'https://registry.npmjs.org{}'
 PAGE = '/-/npm/v1/security/advisories?page=0'
@@ -93,15 +96,26 @@ def extract_data(JSON):
             continue
             # NPM registry has no data regarding this package finally we skip these
 
-        package_vulnerabilities.append({
+        package_vulnerability = {
             'package_name': package_name,
             'summary': obj.get('overview', ''),
             'cve_ids': obj.get('cves', []),
             'fixed_versions': fixed_versions,
             'affected_versions': affected_versions,
             'severity': obj.get('severity', ''),
             'advisory': obj.get('url', ''),
-        })
+        }
+
+        pkg_vuln_hash = xxhash.xxh32(json.dumps(
+            package_vulnerability, sort_keys=True)).intdigest()
+        hash_query = AdvisoryHashes.objects.filter(hash=pkg_vuln_hash)
+        if hash_query:
+            # This exact advisory was already recorded on a previous run,
+            # so skip it rather than import it again.
+            continue
+        package_vulnerabilities.append(package_vulnerability)
+        AdvisoryHashes.objects.create(hash=pkg_vuln_hash)
+
     return package_vulnerabilities
 
 
@@ -115,7 +129,15 @@ def scrape_vulnerabilities():
         try:
             cururl = NPM_URL.format(nextpage)
             response = json.load(urlopen(cururl))
-            package_vulnerabilities.extend(extract_data(response))
+            resp_hash = xxhash.xxh32(json.dumps(
+                response, sort_keys=True)).intdigest()
+            hash_query = AdvisoryHashes.objects.filter(hash=resp_hash)
+
+            if not hash_query:
+
+                package_vulnerabilities.extend(extract_data(response))
+                AdvisoryHashes.objects.create(hash=resp_hash)
+
             nextpage = response.get('urls', {}).get('next')
 
         except HTTPError as error:
diff --git a/vulnerabilities/scraper/ruby.py b/vulnerabilities/scraper/ruby.py
@@ -1,18 +1,29 @@
-import os
+import saneyaml
 import urllib.request
 from urllib.error import HTTPError
 from zipfile import ZipFile
 from io import BytesIO
-import saneyaml
 from dephell_specifier import RangeSpecifier
 from urllib.request import urlopen
+from xxhash import xxh32
+
+from vulnerabilities.models import AdvisoryHashes
 
 RUBYSEC_DB_URL = 'https://github.com/rubysec/ruby-advisory-db/archive/master.zip'
 
 
 def rubygem_advisories(url, prefix='ruby-advisory-db-master/gems/'):
+    hash_of_zip = xxh32()
     with urlopen(url) as response:
-        with ZipFile(BytesIO(response.read())) as zf:
+        with BytesIO(response.read()) as zfbytes:
+            chunk = zfbytes.read()
+            hash_of_zip.update(chunk)
+            hash_of_zip = hash_of_zip.intdigest()
+            hash_query = AdvisoryHashes.objects.filter(hash=hash_of_zip)
+            if hash_query:
+                return []
+            AdvisoryHashes.objects.create(hash=hash_of_zip)
+            zf = ZipFile(zfbytes)
             for path in zf.namelist():
                 if path.startswith(prefix) and path.endswith('.yml'):
                     yield saneyaml.load(zf.open(path))
@@ -67,11 +78,20 @@ def import_vulnerabilities():
                         break
 
         affected_versions = all_versions - unaffected_versions
-        vulnerability_package_dicts.append({
+        vuln_pkg_dict = {
             'package_name': package_name,
             'cve_id': vulnerability_id,
             'fixed_versions': unaffected_versions,
             'affected_versions': affected_versions,
             'advisory': advisory_url
-        })
+        }
+        pkg_vuln_hash = xxhash.xxh32(json.dumps(
+            vuln_pkg_dict, sort_keys=True)).intdigest()
+        hash_query = AdvisoryHashes.objects.filter(hash=pkg_vuln_hash)
+
+        if hash_query:
+            continue
+
+        AdvisoryHashes.objects.create(hash=pkg_vuln_hash)
+        vulnerability_package_dicts.append(vuln_pkg_dict)
     return vulnerability_package_dicts
diff --git a/vulnerabilities/tests/test_npm.py b/vulnerabilities/tests/test_npm.py
@@ -41,7 +41,7 @@ def test_get_all_versions():
 
 
 @pytest.mark.webtest
-def test_extract_data():
+def test_extract_data(db):
     with open(os.path.join(TEST_DATA, 'npm_test.json')) as f:
         test_data = json.load(f)