diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 8f3573b54..317399568 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -145,6 +145,24 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa 'response': self.response }]) + +class QueryParameterError(ProviderError): + """The MFR related errors raised from a :class:`mfr.core.provider`and relating to query + parameters. This error is thrown when the query has an invalid value. + """ + + __TYPE = 'query_parameter' + + def __init__(self, message, *args, url: str='', code: int=400, **kwargs): + super().__init__(message, code=code, *args, **kwargs) + self.url = url + self.return_code = code + self.attr_stack.append([self.__TYPE, { + 'url': self.url, + 'returncode': self.return_code, + }]) + + class TooBigToRenderError(ProviderError): """If the user tries to render a file larger than a server specified maximum, throw a TooBigToRenderError. diff --git a/mfr/core/provider.py b/mfr/core/provider.py index 7931f1d36..402655f6b 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -1,11 +1,12 @@ import abc -import markupsafe +from aiohttp import HttpBadRequest import furl +import markupsafe -from mfr.core import exceptions -from mfr.server import settings +from mfr.core import exceptions as core_exceptions from mfr.core.metrics import MetricsRecord +from mfr.server import settings as server_settings class BaseProvider(metaclass=abc.ABCMeta): @@ -17,12 +18,13 @@ class BaseProvider(metaclass=abc.ABCMeta): def __init__(self, request, url): self.request = request url_netloc = furl.furl(url).netloc - if url_netloc not in settings.ALLOWED_PROVIDER_NETLOCS: - raise exceptions.ProviderError( + if url_netloc not in server_settings.ALLOWED_PROVIDER_NETLOCS: + raise core_exceptions.ProviderError( message="{} is not a permitted provider domain.".format( markupsafe.escape(url_netloc) ), - code=400 + # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later + code=HttpBadRequest.code ) self.url = url self.provider_metrics = MetricsRecord('provider') @@ -48,12 +50,13 @@ def download(self): class ProviderMetadata: - def __init__(self, name, ext, content_type, unique_key, download_url): + def __init__(self, name, ext, content_type, unique_key, download_url, is_public=False): self.name = name self.ext = ext self.content_type = content_type self.unique_key = unique_key self.download_url = download_url + self.is_public = is_public def serialize(self): return { @@ -62,4 +65,5 @@ def serialize(self): 'content_type': self.content_type, 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), + 'is_public': self.is_public, } diff --git a/mfr/core/utils.py b/mfr/core/utils.py index 079f88457..1abea18a9 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -76,14 +76,29 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): :rtype: :class:`mfr.core.extension.BaseRenderer` """ normalized_name = (name and name.lower()) or 'none' + if metadata.is_public: + try: + # Use the public renderer if exist + return driver.DriverManager( + namespace='mfr.public_renderers', + name=normalized_name, + invoke_on_load=True, + invoke_args=(metadata, file_path, url, assets_url, export_url), + ).driver + except: + # If public render does not exist, use default renderer by MFR + # If public render exists but exceptions occurs, delay the exception handling + pass + try: + # Use the default MFR handler return driver.DriverManager( namespace='mfr.renderers', name=normalized_name, invoke_on_load=True, invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver - except RuntimeError: + except: raise exceptions.MakeRendererError( namespace='mfr.renderers', name=normalized_name, @@ -97,6 +112,7 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): } ) + def sizeof_fmt(num, suffix='B'): if abs(num) < 1000: return '%3.0f%s' % (num, suffix) diff --git a/mfr/extensions/office365/README.md b/mfr/extensions/office365/README.md new file mode 100644 index 000000000..a43ca9cc4 --- /dev/null +++ b/mfr/extensions/office365/README.md @@ -0,0 +1,20 @@ + +# Office 365 Renderer + + +This renderer uses Office Online to render .docx files for us. If the Office Online URL ever changes, it will also need to be changed here in settings. + +Currently there is no OSF side component for these changes. Once there is, this specific note can be removed. In the meantime in order to test this renderer, you need to go to your local OSF copy of this file: https://github.com/CenterForOpenScience/osf.io/blob/develop/addons/base/views.py#L728-L736 +and add 'public_file' : 1, to the dict. This will send all files as public files. + +Testing this renderer locally is hard. Since Office Online needs access to the files it will not work with private files or ones hosted locally. To see what the docx files will render like, replace the render function with something that looks like this: + +``` + def render(self): + static_url = 'https://files.osf.io/v1/resources//providers/osfstorage/' + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + +``` + +The file at `static_url` must be publicly available. diff --git a/mfr/extensions/office365/__init__.py b/mfr/extensions/office365/__init__.py new file mode 100644 index 000000000..08833dba1 --- /dev/null +++ b/mfr/extensions/office365/__init__.py @@ -0,0 +1 @@ +from .render import Office365Renderer # noqa diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py new file mode 100644 index 000000000..2c03cb3e1 --- /dev/null +++ b/mfr/extensions/office365/render.py @@ -0,0 +1,40 @@ +import os +from urllib import parse + +import furl +from mako.lookup import TemplateLookup + +from mfr.core import extension +from mfr.extensions.office365 import settings as office365_settings + + +class Office365Renderer(extension.BaseRenderer): + """A renderer for .docx files that are publicly available. + + Office online can render `.docx` files to `.pdf` for us. This renderer will only be made + if a query param with `public_file=true` is present. It then generates and embeds an + office online URL into an `iframe` and returns the template. The file it is trying to + render MUST be public. + + Note: this renderer DOES NOT work locally. + + """ + + TEMPLATE = TemplateLookup( + directories=[ + os.path.join(os.path.dirname(__file__), 'templates') + ]).get_template('viewer.mako') + + def render(self): + download_url = furl.furl(self.metadata.download_url).set(query='').url + encoded_download_url = parse.quote(download_url) + office_render_url = office365_settings.OFFICE_BASE_URL + encoded_download_url + return self.TEMPLATE.render(base=self.assets_url, url=office_render_url) + + @property + def file_required(self): + return False + + @property + def cache_result(self): + return False diff --git a/mfr/extensions/office365/settings.py b/mfr/extensions/office365/settings.py new file mode 100644 index 000000000..c92ba78e4 --- /dev/null +++ b/mfr/extensions/office365/settings.py @@ -0,0 +1,6 @@ +from mfr import settings + + +config = settings.child('OFFICE365_EXTENSION_CONFIG') + +OFFICE_BASE_URL = 'https://view.officeapps.live.com/op/embed.aspx?src=' diff --git a/mfr/extensions/office365/templates/viewer.mako b/mfr/extensions/office365/templates/viewer.mako new file mode 100644 index 000000000..cfc2840dc --- /dev/null +++ b/mfr/extensions/office365/templates/viewer.mako @@ -0,0 +1,11 @@ + + + + + + diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 042f9e9b1..3f8da1cfb 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -1,9 +1,10 @@ -import os import json import hashlib +from http import HTTPStatus import logging -from urllib.parse import urlparse import mimetypes +import os +from urllib.parse import urlparse import furl import aiohttp @@ -11,12 +12,10 @@ from waterbutler.core import streams -from mfr.core import exceptions -from mfr.core import provider -from mfr.core.utils import sizeof_fmt -from mfr.providers.osf import settings -from mfr.settings import MAX_FILE_SIZE_TO_RENDER -from mfr.core.exceptions import TooBigToRenderError +from mfr import settings as mfr_settings +from mfr.core import exceptions as core_exceptions +from mfr.core import provider, utils +from mfr.providers.osf import settings as provider_settings logger = logging.getLogger(__name__) @@ -74,14 +73,14 @@ async def metadata(self): response_reason = metadata_response.reason response_headers = metadata_response.headers await metadata_response.release() - if response_code != 200: - raise exceptions.MetadataError( + if response_code != HTTPStatus.OK: + raise core_exceptions.MetadataError( 'Failed to fetch file metadata from WaterButler. Received response: ', 'code {} {}'.format(str(response_code), str(response_reason)), metadata_url=download_url, response=response_reason, provider=self.NAME, - code=400 + code=HTTPStatus.BAD_REQUEST ) try: @@ -104,12 +103,12 @@ async def metadata(self): name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] - max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) + max_file_size = mfr_settings.MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: - raise TooBigToRenderError( + raise core_exceptions.TooBigToRenderError( "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), + "offline.".format(ext=ext, max_size=utils.sizeof_fmt(max_file_size)), requested_size=int(size), maximum_size=max_file_size, ) @@ -119,18 +118,32 @@ async def metadata(self): cleaned_url.args.pop(unneeded, None) self.metrics.add('metadata.clean_url_args', str(cleaned_url)) unique_key = hashlib.sha256((metadata['data']['etag'] + cleaned_url.url).encode('utf-8')).hexdigest() - return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url) + + is_public = False + public_file = cleaned_url.args.get('public_file', None) + if public_file: + if public_file not in ['True', 'False']: + raise core_exceptions.QueryParameterError( + 'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']), + url=download_url, + provider=self.NAME, + code=HTTPStatus.BAD_REQUEST, + ) + is_public = public_file == 'True' + + return provider.ProviderMetadata(name, ext, content_type, + unique_key, download_url, is_public=is_public) async def download(self): """Download file from WaterButler, returning stream.""" download_url = await self._fetch_download_url() - headers = {settings.MFR_IDENTIFYING_HEADER: '1'} + headers = {provider_settings.MFR_IDENTIFYING_HEADER: '1'} response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers) - if response.status >= 400: + if response.status >= HTTPStatus.BAD_REQUEST: resp_text = await response.text() logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text)) - raise exceptions.DownloadError( + raise core_exceptions.DownloadError( 'Unable to download the requested file, please try again later.', download_url=download_url, response=resp_text, @@ -138,7 +151,7 @@ async def download(self): ) self.metrics.add('download.saw_redirect', False) - if response.status in (302, 301): + if response.status in (HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND): await response.release() response = await aiohttp.request('GET', response.headers['location']) self.metrics.add('download.saw_redirect', True) @@ -170,8 +183,8 @@ async def _fetch_download_url(self): ) await request.release() - if request.status != 302: - raise exceptions.MetadataError( + if request.status != HTTPStatus.FOUND: + raise core_exceptions.MetadataError( request.reason, metadata_url=self.url, provider=self.NAME, diff --git a/setup.py b/setup.py index 4fa7a9663..4a9618642 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,9 @@ def parse_requirements(requirements): 'http = mfr.providers.http:HttpProvider', 'osf = mfr.providers.osf:OsfProvider', ], + 'mfr.public_renderers': [ + '.docx = mfr.extensions.office365:Office365Renderer', + ], 'mfr.exporters': [ # google docs '.gdraw = mfr.extensions.image:ImageExporter', diff --git a/tests/extensions/office365/__init__.py b/tests/extensions/office365/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py new file mode 100644 index 000000000..5d90d3990 --- /dev/null +++ b/tests/extensions/office365/test_renderer.py @@ -0,0 +1,54 @@ +from urllib import parse + +import furl +import pytest + +from mfr.core.provider import ProviderMetadata +from mfr.extensions.office365 import Office365Renderer +from mfr.extensions.office365 import settings as office365_settings + + +@pytest.fixture +def metadata(): + return ProviderMetadata( + 'test', + '.pdf', + 'text/plain', + '1234', + 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', + is_public=True + ) + + +@pytest.fixture +def file_path(): + return '/tmp/test.docx' + + +@pytest.fixture +def url(): + return parse.quote('http://osf.io/file/test.pdf') + + +@pytest.fixture +def assets_url(): + return 'http://mfr.osf.io/assets' + + +@pytest.fixture +def export_url(): + return 'http://mfr.osf.io/export?url=' + url() + + +@pytest.fixture +def renderer(metadata, file_path, url, assets_url, export_url): + return Office365Renderer(metadata, file_path, url, assets_url, export_url) + + +class TestOffice365Renderer: + + def test_render_pdf(self, renderer, metadata): + download_url = furl.furl(metadata.download_url).set(query='') + office_render_url = office365_settings.OFFICE_BASE_URL + parse.quote(download_url.url) + body = renderer.render() + assert ''.format(office_render_url) in body diff --git a/tests/server/handlers/test_query_params.py b/tests/server/handlers/test_query_params.py index b707035dc..001c42b27 100644 --- a/tests/server/handlers/test_query_params.py +++ b/tests/server/handlers/test_query_params.py @@ -7,7 +7,7 @@ from tests import utils -class TestRenderHandler(utils.HandlerTestCase): +class TestQueryParamsHandler(utils.HandlerTestCase): @testing.gen_test def test_format_url(self):