[1418] Cached report refactor - QA is now triggered by the archiver via DataPipe (the IPipe plugin interface). Minor fixes to the QA task - it now runs OK.

David Read · David Read · commit 48fadf2585d1 · 2014-05-01T09:40:37.000Z
diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py
@@ -37,15 +37,11 @@ class QA(Base):
     updated = Column(types.DateTime)
 
     def __repr__(self):
-        if not self.error:
-            summary = 'score=%s format=%s' % (self.openness_score, self.format)
-            details = self.openness_score_reason
-        else:
-            summary = 'ERROR'
-            details = self.error
+        summary = 'score=%s format=%s' % (self.openness_score, self.format)
+        details = self.openness_score_reason
         package = model.Package.get(self.package_id)
         package_name = package.name if package else '?%s?' % self.package_id
-        return '<QA %s /dataset/%s/resource/%s%s>' % \
+        return '<QA %s /dataset/%s/resource/%s %s>' % \
             (summary, package_name, self.resource_id, details)
 
     def as_dict(self):
diff --git a/ckanext/qa/plugin.py b/ckanext/qa/plugin.py
@@ -20,16 +20,20 @@
 import reports
 import logic
 
+from ckanext.archiver.interfaces import IPipe
+
 resource_dictize = model_dictize.resource_dictize
 send_task = celery_app.celery.send_task
 
 log = logging.getLogger(__name__)
 
+
 class QAPlugin(p.SingletonPlugin):
     p.implements(p.IConfigurer, inherit=True)
     p.implements(p.IRoutes, inherit=True)
-    p.implements(p.IDomainObjectModification, inherit=True)
-    p.implements(p.IResourceUrlChange)
+    p.implements(IPipe, inherit=True)
+    #p.implements(p.IDomainObjectModification, inherit=True)
+    #p.implements(p.IResourceUrlChange)
     p.implements(p.IActions)
     p.implements(p.IReportCache)
 
@@ -92,22 +96,19 @@ def before_map(self, map):
 
         return map
 
-    # IDomainObjectModification / IResourceUrlChange
+    # IPipe
 
-    def notify(self, entity, operation=None):
-        if not isinstance(entity, model.Resource):
+    def receive_data(self, operation, **params):
+        '''Receive notification from ckan-archiver that a resource has been archived.'''
+        if not operation == 'archived':
             return
-        resource = entity
-
-        if operation:
-            if operation == model.DomainObjectOperation.new:
-                # Resource created
-                create_qa_update_task(resource, queue='priority')
-        else:
-            # Resource URL has changed.
-            # If operation is None, resource URL has been changed because the
-            # notify function in IResourceUrlChange only takes 1 parameter
-            create_qa_update_task(resource, queue='priority')
+        resource_id = params['resource_id']
+        #cache_filepath = params['cached_filepath']
+
+        resource = model.Resource.get(resource_id)
+        assert resource
+
+        create_qa_update_task(resource, queue='priority')
 
     # IActions
 
diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py
@@ -360,7 +360,6 @@ def get_zipped_format(filepath, log):
         log.info('Zip has no known extensions: %s', filepath)
         return Formats.by_display_name()['Zip']
 
-    print top_scoring_extension_counts.items()
     top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                           key=lambda x: x[1])
     top_extension = top_scoring_extension_counts[-1][0]
@@ -372,17 +371,19 @@ def get_zipped_format(filepath, log):
     format_['container'] = Formats.by_display_name()['Zip']['display_name']
     log.info('Zipped file format detected: %s', format_['display_name'])
     return format_
-    
+
+
 def is_excel(filepath, log):
     try:
-        book = xlrd.open_workbook(filepath)
+        xlrd.open_workbook(filepath)
     except Exception, e:
         log.info('Not Excel - failed to load: %s %s', e, e.args)
         return False
     else:
         log.info('Excel file opened successfully')
         return True
 
+
 # same as the python 2.7 subprocess.check_output
 def check_output(*popenargs, **kwargs):
     if 'stdout' in kwargs:
@@ -425,7 +426,8 @@ def run_bsd_file(filepath, log):
         return format_
     log.info('"file" could not determine file format of "%s": %s',
              filepath, result)
-                      
+
+
 def is_ttl(buf, log):
     '''If the buffer is a Turtle RDF file then return True.'''
     # Turtle spec: "Turtle documents may have the strings '@prefix' or '@base' (case dependent) near the beginning of the document."
@@ -465,12 +467,12 @@ def turtle_regex():
      does not support nested blank nodes, collection, sameas ('a' token)
     '''
     if not turtle_regex_:
-         global turtle_regex_
-         rdf_term = '(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)'
-
-         # simple case is: triple_re = '^T T T \.$'.replace('T', rdf_term)
-         # but extend to deal with multiple predicate-objects:
-         #triple = '^T T T\s*(;\s*T T\s*)*\.\s*$'.replace('T', rdf_term).replace(' ', '\s+')
-         triple = '(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', '\s+')
-         turtle_regex_ = re.compile(triple, re.MULTILINE)
+        global turtle_regex_
+        rdf_term = '(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)'
+
+        # simple case is: triple_re = '^T T T \.$'.replace('T', rdf_term)
+        # but extend to deal with multiple predicate-objects:
+        #triple = '^T T T\s*(;\s*T T\s*)*\.\s*$'.replace('T', rdf_term).replace(' ', '\s+')
+        triple = '(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', '\s+')
+        turtle_regex_ = re.compile(triple, re.MULTILINE)
     return turtle_regex_
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
@@ -8,6 +8,7 @@
 import traceback
 
 import ckan.lib.celery_app as celery_app
+from ckan.lib.json import DateTimeJsonEncoder
 from ckanext.dgu.lib.formats import Formats
 from ckanext.qa.sniff_format import sniff_file_format
 from ckanext.archiver.model import Archival, Status
@@ -108,15 +109,15 @@ def update(ckan_ini_filepath, resource_id):
             raise QAError('Resource ID not found: %s' % resource_id)
         qa_result = resource_score(resource, log)
         log.info('Openness scoring: \n%r\n%r\n%r\n\n', qa_result, resource,
-                 resource['url'])
+                 resource.url)
         save_qa_result(resource.id, qa_result, log)
         log.info('CKAN updated with openness score')
         package = resource.resource_group.package if resource.resource_group else None
         if package:
             update_search_index(package.id, log)
         else:
             log.warning('Resource not connected to a package. Res: %r', resource)
-        return json.dumps(qa_result)
+        return json.dumps(qa_result, cls=DateTimeJsonEncoder)
     except Exception, e:
         log.error('Exception occurred during QA update: %s: %s',
                   e.__class__.__name__,  unicode(e))
@@ -132,7 +133,7 @@ def get_qa_format(resource_id):
     return q.format
 
 
-def resource_score(resource_id, log):
+def resource_score(resource, log):
     """
     Score resource on Sir Tim Berners-Lee\'s five stars of openness.
 
@@ -145,18 +146,15 @@ def resource_score(resource_id, log):
 
     Raises QAError for reasonable errors
     """
-    from ckan import model
-
     score = 0
     score_reason = ''
     format_ = None
 
     try:
         score_reasons = []  # a list of strings detailing how we scored it
-        archival = Archival.get_for_resource(resource_id=resource_id)
-        resource = model.Resource.get(resource_id)
+        archival = Archival.get_for_resource(resource_id=resource.id)
         if not resource:
-            raise QAError('Could not find resource "%s"' % resource_id)
+            raise QAError('Could not find resource "%s"' % resource.id)
 
         score, format_ = score_if_link_broken(archival, resource, score_reasons, log)
         if score == None:
@@ -182,8 +180,7 @@ def resource_score(resource_id, log):
     except Exception, e:
         log.error('Unexpected error while calculating openness score %s: %s\nException: %s', e.__class__.__name__,  unicode(e), traceback.format_exc())
         score_reason = "Unknown error: %s" % str(e)
-        if os.environ.get('DEBUG'):
-            raise
+        raise
 
     # Even if we can get the link, we should still treat the resource
     # as having a score of 0 if the license isn't open.
@@ -398,7 +395,7 @@ def save_qa_result(resource_id, qa_result, log):
     else:
         log.info('QA from before: %r', qa)
 
-    for key in ('openness_score', 'openness_reason', 'format'):
+    for key in ('openness_score', 'openness_score_reason', 'format'):
         setattr(qa, key, qa_result[key])
     qa.archival_timestamp == qa_result['archival_timestamp']
     qa.updated = now
diff --git a/tests/test_tasks.py b/tests/test_tasks.py
@@ -1,18 +1,17 @@
 import requests
-import json
 import logging
-import os
 import urllib
 import datetime
 
-from nose.tools import raises, assert_equal
+from nose.tools import assert_equal
 from ckan import model
 from ckan.tests import BaseCase
 from ckan.logic import get_action
 
 import ckanext.qa.tasks
 from ckanext.qa.tasks import resource_score, extension_variants
 import ckanext.archiver
+import ckanext.archiver.tasks
 from ckanext.dgu.lib.formats import Formats
 from ckanext.qa import model as qa_model
 from ckanext.archiver import model as archiver_model
@@ -43,6 +42,43 @@ def set_sniffed_format(format_display_name):
 
 TODAY = datetime.datetime(year=2008, month=10, day=10)
 
+class TestTask(BaseCase):
+
+    @classmethod
+    def setup_class(cls):
+        archiver_model.init_tables(model.meta.engine)
+        qa_model.init_tables(model.meta.engine)
+
+    def teardown(self):
+        model.repo.rebuild_db()
+
+    def test_trigger_on_archival(cls):
+        # create package
+        context = {'model': model, 'ignore_auth': True, 'session': model.Session, 'user': 'test'}
+        pkg = {'name': 'testpkg', 'license_id': 'uk-ogl', 'resources': [
+            {'url': 'http://test.com/', 'format': 'CSV', 'description': 'Test'}
+            ]}
+        pkg = get_action('package_create')(context, pkg)
+        resource_dict = pkg['resources'][0]
+        res_id = resource_dict['id']
+        # create record of archival
+        archival = Archival.create(res_id)
+        cache_filepath = __file__  # just needs to exist
+        archival.cache_filepath = cache_filepath
+        archival.updated = TODAY
+        model.Session.add(archival)
+        model.Session.commit()
+        # TODO show that QA hasn't run yet
+
+        # create a send_data from ckanext-archiver, that gets picked up by
+        # ckanext-qa to put a task on the queue
+        ckanext.archiver.tasks.notify(resource_dict, cache_filepath)
+        # this is useful on its own (without any asserts) because it checks
+        # there are no exceptions when running it
+
+        # TODO run celery and check it actually ran...
+
+
 class TestResourceScore(BaseCase):
 
     @classmethod
@@ -60,11 +96,7 @@ def setup_class(cls):
         }
 
     def teardown(self):
-        pkg = model.Package.get(u'testpkg')
-        if pkg:
-            model.repo.new_revision()
-            pkg.purge()
-            model.repo.commit_and_remove()
+        model.repo.rebuild_db()
 
     def _test_resource(self, url='anything', format='TXT', archived=True, cached=True, license_id='uk-ogl'):
         context = {'model': model, 'ignore_auth': True, 'session': model.Session, 'user': 'test'}
@@ -79,7 +111,7 @@ def _test_resource(self, url='anything', format='TXT', archived=True, cached=Tru
             archival.updated = TODAY
             model.Session.add(archival)
             model.Session.commit()
-        return res_id
+        return model.Resource.get(res_id)
 
     @classmethod
     def _set_task_status(cls, task_type, task_status_str):
@@ -173,16 +205,16 @@ def test_available_but_not_open(self):
         assert 'License not open' in result['openness_score_reason'], result
 
     def test_not_available_and_not_open(self):
-        res_id = self._test_resource(license_id=None, format=None, cached=False)
-        archival = Archival.get_for_resource(res_id)
+        res = self._test_resource(license_id=None, format=None, cached=False)
+        archival = Archival.get_for_resource(res.id)
         archival.status_id = Status.by_text('Download error')
         archival.reason = 'Server returned 500 error'
         archival.last_success = None
         archival.first_failure = datetime.datetime(year=2008, month=10, day=1, hour=6, minute=30)
         archival.failure_count = 16
         archival.is_broken = True
         model.Session.commit()
-        result = resource_score(res_id, log)
+        result = resource_score(res, log)
         assert result['openness_score'] == 0, result
         assert_equal(result['format'], None)
         # in preference it should report that it is not available
@@ -192,22 +224,22 @@ def test_not_available_any_more(self):
         # A cache of the data still exists from the previous run, but this
         # time, the archiver found the file gave a 404.
         # The record of the previous (successful) run of QA.
-        res_id = self._test_resource(license_id=None, format=None)
-        qa = qa_model.QA.create(res_id)
+        res = self._test_resource(license_id=None, format=None)
+        qa = qa_model.QA.create(res.id)
         qa.format = 'CSV'
         model.Session.add(qa)
         model.Session.commit()
         # cache still exists from the previous run, but this time, the archiver
         # found the file gave a 404.
-        archival = Archival.get_for_resource(res_id)
+        archival = Archival.get_for_resource(res.id)
         archival.cache_filepath = __file__
         archival.status_id = Status.by_text('Download error')
         archival.reason = 'Server returned 404 error'
         archival.last_success = datetime.datetime(year=2008, month=10, day=1)
         archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
         archival.failure_count = 1
         archival.is_broken = True
-        result = resource_score(res_id, log)
+        result = resource_score(res, log)
         assert result['openness_score'] == 0, result
         assert_equal(result['format'], 'CSV')
         # in preference it should report that it is not available