From 04740f1f7e4e39fa6439d5a82eeafe4656567e33 Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Mon, 2 Oct 2017 12:15:07 -0400 Subject: [PATCH 1/2] `Public_file` query param and office365 renderer Adding support for a `public_file` query param so the OSF can request a public renderer. Added office365 which is a public renderer. This uses office online to do .docx file conversions. --- mfr/core/exceptions.py | 19 ++++++++ mfr/core/provider.py | 4 +- mfr/core/utils.py | 13 +++++ mfr/extensions/office365/README.md | 20 ++++++++ mfr/extensions/office365/__init__.py | 1 + mfr/extensions/office365/render.py | 36 ++++++++++++++ mfr/extensions/office365/settings.py | 6 +++ .../office365/templates/viewer.mako | 11 +++++ mfr/providers/osf/provider.py | 18 ++++++- setup.py | 3 ++ tests/extensions/office365/__init__.py | 0 tests/extensions/office365/test_renderer.py | 47 +++++++++++++++++++ 12 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 mfr/extensions/office365/README.md create mode 100644 mfr/extensions/office365/__init__.py create mode 100644 mfr/extensions/office365/render.py create mode 100644 mfr/extensions/office365/settings.py create mode 100644 mfr/extensions/office365/templates/viewer.mako create mode 100644 tests/extensions/office365/__init__.py create mode 100644 tests/extensions/office365/test_renderer.py diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 8f3573b54..9f7d5c8dc 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -145,6 +145,25 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa 'response': self.response }]) + +class QueryParameterError(ProviderError): + """The MFR related errors raised from a :class:`mfr.core.provider` and relating to query parameters + should inherit from MetadataError + This error is thrown when a query parameter is used missused + """ + + __TYPE = 'query_parameter' + + def __init__(self, message, *args, url: str='', code: int=400, **kwargs): + super().__init__(message, code=code, *args, **kwargs) + self.url = url + self.return_code = code + self.attr_stack.append([self.__TYPE, { + 'url': self.url, + 'returncode': self.return_code, + }]) + + class TooBigToRenderError(ProviderError): """If the user tries to render a file larger than a server specified maximum, throw a TooBigToRenderError. diff --git a/mfr/core/provider.py b/mfr/core/provider.py index 7931f1d36..00054c313 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -48,17 +48,19 @@ def download(self): class ProviderMetadata: - def __init__(self, name, ext, content_type, unique_key, download_url): + def __init__(self, name, ext, content_type, unique_key, download_url, is_public=False): self.name = name self.ext = ext self.content_type = content_type self.unique_key = unique_key self.download_url = download_url + self.is_public = is_public def serialize(self): return { 'name': self.name, 'ext': self.ext, + 'is_public': self.is_public, 'content_type': self.content_type, 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), diff --git a/mfr/core/utils.py b/mfr/core/utils.py index 079f88457..c5f7713cc 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -76,6 +76,19 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): :rtype: :class:`mfr.core.extension.BaseRenderer` """ normalized_name = (name and name.lower()) or 'none' + if metadata.is_public: + try: + return driver.DriverManager( + namespace='mfr.public_renderers', + name=normalized_name, + invoke_on_load=True, + invoke_args=(metadata, file_path, url, assets_url, export_url), + ).driver + except: + # Check for a public renderer, if one doesn't exist, use a regular one + # Real exceptions handled by main driver.DriverManager + pass + try: return driver.DriverManager( namespace='mfr.renderers', diff --git a/mfr/extensions/office365/README.md b/mfr/extensions/office365/README.md new file mode 100644 index 000000000..a43ca9cc4 --- /dev/null +++ b/mfr/extensions/office365/README.md @@ -0,0 +1,20 @@ + +# Office 365 Renderer + + +This renderer uses Office Online to render .docx files for us. If the Office Online URL ever changes, it will also need to be changed here in settings. + +Currently there is no OSF side component for these changes. Once there is, this specific note can be removed. In the meantime in order to test this renderer, you need to go to your local OSF copy of this file: https://github.com/CenterForOpenScience/osf.io/blob/develop/addons/base/views.py#L728-L736 +and add 'public_file' : 1, to the dict. This will send all files as public files. + +Testing this renderer locally is hard. Since Office Online needs access to the files it will not work with private files or ones hosted locally. To see what the docx files will render like, replace the render function with something that looks like this: + +``` + def render(self): + static_url = 'https://files.osf.io/v1/resources//providers/osfstorage/' + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + +``` + +The file at `static_url` must be publicly available. diff --git a/mfr/extensions/office365/__init__.py b/mfr/extensions/office365/__init__.py new file mode 100644 index 000000000..08833dba1 --- /dev/null +++ b/mfr/extensions/office365/__init__.py @@ -0,0 +1 @@ +from .render import Office365Renderer # noqa diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py new file mode 100644 index 000000000..2760ce761 --- /dev/null +++ b/mfr/extensions/office365/render.py @@ -0,0 +1,36 @@ +import os +import furl + +from mfr.core import extension +from mako.lookup import TemplateLookup +from mfr.extensions.office365 import settings + + +class Office365Renderer(extension.BaseRenderer): + """A renderer for use with public .docx files. + + Office online can render .docx files to pdf for us. + This renderer will only ever be made if a query param with `public_file=1` is sent. + It then generates and embeds an office online url into an + iframe and returns the template. The file it is trying to render MUST + be available publically online. This renderer will not work if testing locally. + + """ + + TEMPLATE = TemplateLookup( + directories=[ + os.path.join(os.path.dirname(__file__), 'templates') + ]).get_template('viewer.mako') + + def render(self): + download_url = furl.furl(self.metadata.download_url).set(query='') + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + + @property + def file_required(self): + return False + + @property + def cache_result(self): + return False diff --git a/mfr/extensions/office365/settings.py b/mfr/extensions/office365/settings.py new file mode 100644 index 000000000..c92ba78e4 --- /dev/null +++ b/mfr/extensions/office365/settings.py @@ -0,0 +1,6 @@ +from mfr import settings + + +config = settings.child('OFFICE365_EXTENSION_CONFIG') + +OFFICE_BASE_URL = 'https://view.officeapps.live.com/op/embed.aspx?src=' diff --git a/mfr/extensions/office365/templates/viewer.mako b/mfr/extensions/office365/templates/viewer.mako new file mode 100644 index 000000000..cfc2840dc --- /dev/null +++ b/mfr/extensions/office365/templates/viewer.mako @@ -0,0 +1,11 @@ + + + + + + diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 042f9e9b1..52bf13d36 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -119,7 +119,23 @@ async def metadata(self): cleaned_url.args.pop(unneeded, None) self.metrics.add('metadata.clean_url_args', str(cleaned_url)) unique_key = hashlib.sha256((metadata['data']['etag'] + cleaned_url.url).encode('utf-8')).hexdigest() - return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url) + + is_public = False + + if 'public_file' in cleaned_url.args: + if cleaned_url.args['public_file'] not in ['0', '1']: + raise exceptions.QueryParameterError( + 'The `public_file` query paramter should either `0`, `1`, or unused. Instead ' + 'got {}'.format(cleaned_url.args['public_file']), + url=download_url, + provider=self.NAME, + code=400, + ) + + is_public = cleaned_url.args['public_file'] == '1' + + return provider.ProviderMetadata(name, ext, content_type, + unique_key, download_url, is_public=is_public) async def download(self): """Download file from WaterButler, returning stream.""" diff --git a/setup.py b/setup.py index 4fa7a9663..4a9618642 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,9 @@ def parse_requirements(requirements): 'http = mfr.providers.http:HttpProvider', 'osf = mfr.providers.osf:OsfProvider', ], + 'mfr.public_renderers': [ + '.docx = mfr.extensions.office365:Office365Renderer', + ], 'mfr.exporters': [ # google docs '.gdraw = mfr.extensions.image:ImageExporter', diff --git a/tests/extensions/office365/__init__.py b/tests/extensions/office365/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py new file mode 100644 index 000000000..ae485a125 --- /dev/null +++ b/tests/extensions/office365/test_renderer.py @@ -0,0 +1,47 @@ +import furl +import pytest + +from mfr.extensions.office365 import settings +from mfr.core.provider import ProviderMetadata +from mfr.extensions.office365 import Office365Renderer + + +@pytest.fixture +def metadata(): + return ProviderMetadata('test', '.pdf', 'text/plain', '1234', + 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', + is_public=True) + + +@pytest.fixture +def file_path(): + return '/tmp/test.docx' + + +@pytest.fixture +def url(): + return 'http://osf.io/file/test.pdf' + + +@pytest.fixture +def assets_url(): + return 'http://mfr.osf.io/assets' + + +@pytest.fixture +def export_url(): + return 'http://mfr.osf.io/export?url=' + url() + + +@pytest.fixture +def renderer(metadata, file_path, url, assets_url, export_url): + return Office365Renderer(metadata, file_path, url, assets_url, export_url) + + +class TestOffice365Renderer: + + def test_render_pdf(self, renderer, metadata, assets_url): + download_url = furl.furl(metadata.download_url).set(query='') + body_url = settings.OFFICE_BASE_URL + download_url.url + body = renderer.render() + assert ''.format(body_url) in body From 2eec820406bd15d438efbb718636edd663fd414f Mon Sep 17 00:00:00 2001 From: longze chen Date: Wed, 6 Dec 2017 13:50:09 -0500 Subject: [PATCH 2/2] Encode download url, fix tests and style updates. --- mfr/core/exceptions.py | 5 +- mfr/core/provider.py | 16 +++--- mfr/core/utils.py | 9 ++-- mfr/extensions/office365/render.py | 26 +++++----- mfr/providers/osf/provider.py | 55 ++++++++++----------- tests/extensions/office365/test_renderer.py | 21 +++++--- tests/server/handlers/test_query_params.py | 2 +- 7 files changed, 73 insertions(+), 61 deletions(-) diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 9f7d5c8dc..317399568 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -147,9 +147,8 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa class QueryParameterError(ProviderError): - """The MFR related errors raised from a :class:`mfr.core.provider` and relating to query parameters - should inherit from MetadataError - This error is thrown when a query parameter is used missused + """The MFR related errors raised from a :class:`mfr.core.provider`and relating to query + parameters. This error is thrown when the query has an invalid value. """ __TYPE = 'query_parameter' diff --git a/mfr/core/provider.py b/mfr/core/provider.py index 00054c313..402655f6b 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -1,11 +1,12 @@ import abc -import markupsafe +from aiohttp import HttpBadRequest import furl +import markupsafe -from mfr.core import exceptions -from mfr.server import settings +from mfr.core import exceptions as core_exceptions from mfr.core.metrics import MetricsRecord +from mfr.server import settings as server_settings class BaseProvider(metaclass=abc.ABCMeta): @@ -17,12 +18,13 @@ class BaseProvider(metaclass=abc.ABCMeta): def __init__(self, request, url): self.request = request url_netloc = furl.furl(url).netloc - if url_netloc not in settings.ALLOWED_PROVIDER_NETLOCS: - raise exceptions.ProviderError( + if url_netloc not in server_settings.ALLOWED_PROVIDER_NETLOCS: + raise core_exceptions.ProviderError( message="{} is not a permitted provider domain.".format( markupsafe.escape(url_netloc) ), - code=400 + # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later + code=HttpBadRequest.code ) self.url = url self.provider_metrics = MetricsRecord('provider') @@ -60,8 +62,8 @@ def serialize(self): return { 'name': self.name, 'ext': self.ext, - 'is_public': self.is_public, 'content_type': self.content_type, 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), + 'is_public': self.is_public, } diff --git a/mfr/core/utils.py b/mfr/core/utils.py index c5f7713cc..1abea18a9 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -78,6 +78,7 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): normalized_name = (name and name.lower()) or 'none' if metadata.is_public: try: + # Use the public renderer if exist return driver.DriverManager( namespace='mfr.public_renderers', name=normalized_name, @@ -85,18 +86,19 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver except: - # Check for a public renderer, if one doesn't exist, use a regular one - # Real exceptions handled by main driver.DriverManager + # If public render does not exist, use default renderer by MFR + # If public render exists but exceptions occurs, delay the exception handling pass try: + # Use the default MFR handler return driver.DriverManager( namespace='mfr.renderers', name=normalized_name, invoke_on_load=True, invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver - except RuntimeError: + except: raise exceptions.MakeRendererError( namespace='mfr.renderers', name=normalized_name, @@ -110,6 +112,7 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): } ) + def sizeof_fmt(num, suffix='B'): if abs(num) < 1000: return '%3.0f%s' % (num, suffix) diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py index 2760ce761..2c03cb3e1 100644 --- a/mfr/extensions/office365/render.py +++ b/mfr/extensions/office365/render.py @@ -1,19 +1,22 @@ import os +from urllib import parse + import furl +from mako.lookup import TemplateLookup from mfr.core import extension -from mako.lookup import TemplateLookup -from mfr.extensions.office365 import settings +from mfr.extensions.office365 import settings as office365_settings class Office365Renderer(extension.BaseRenderer): - """A renderer for use with public .docx files. + """A renderer for .docx files that are publicly available. + + Office online can render `.docx` files to `.pdf` for us. This renderer will only be made + if a query param with `public_file=true` is present. It then generates and embeds an + office online URL into an `iframe` and returns the template. The file it is trying to + render MUST be public. - Office online can render .docx files to pdf for us. - This renderer will only ever be made if a query param with `public_file=1` is sent. - It then generates and embeds an office online url into an - iframe and returns the template. The file it is trying to render MUST - be available publically online. This renderer will not work if testing locally. + Note: this renderer DOES NOT work locally. """ @@ -23,9 +26,10 @@ class Office365Renderer(extension.BaseRenderer): ]).get_template('viewer.mako') def render(self): - download_url = furl.furl(self.metadata.download_url).set(query='') - url = settings.OFFICE_BASE_URL + download_url.url - return self.TEMPLATE.render(base=self.assets_url, url=url) + download_url = furl.furl(self.metadata.download_url).set(query='').url + encoded_download_url = parse.quote(download_url) + office_render_url = office365_settings.OFFICE_BASE_URL + encoded_download_url + return self.TEMPLATE.render(base=self.assets_url, url=office_render_url) @property def file_required(self): diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 52bf13d36..3f8da1cfb 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -1,9 +1,10 @@ -import os import json import hashlib +from http import HTTPStatus import logging -from urllib.parse import urlparse import mimetypes +import os +from urllib.parse import urlparse import furl import aiohttp @@ -11,12 +12,10 @@ from waterbutler.core import streams -from mfr.core import exceptions -from mfr.core import provider -from mfr.core.utils import sizeof_fmt -from mfr.providers.osf import settings -from mfr.settings import MAX_FILE_SIZE_TO_RENDER -from mfr.core.exceptions import TooBigToRenderError +from mfr import settings as mfr_settings +from mfr.core import exceptions as core_exceptions +from mfr.core import provider, utils +from mfr.providers.osf import settings as provider_settings logger = logging.getLogger(__name__) @@ -74,14 +73,14 @@ async def metadata(self): response_reason = metadata_response.reason response_headers = metadata_response.headers await metadata_response.release() - if response_code != 200: - raise exceptions.MetadataError( + if response_code != HTTPStatus.OK: + raise core_exceptions.MetadataError( 'Failed to fetch file metadata from WaterButler. Received response: ', 'code {} {}'.format(str(response_code), str(response_reason)), metadata_url=download_url, response=response_reason, provider=self.NAME, - code=400 + code=HTTPStatus.BAD_REQUEST ) try: @@ -104,12 +103,12 @@ async def metadata(self): name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] - max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) + max_file_size = mfr_settings.MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: - raise TooBigToRenderError( + raise core_exceptions.TooBigToRenderError( "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), + "offline.".format(ext=ext, max_size=utils.sizeof_fmt(max_file_size)), requested_size=int(size), maximum_size=max_file_size, ) @@ -121,18 +120,16 @@ async def metadata(self): unique_key = hashlib.sha256((metadata['data']['etag'] + cleaned_url.url).encode('utf-8')).hexdigest() is_public = False - - if 'public_file' in cleaned_url.args: - if cleaned_url.args['public_file'] not in ['0', '1']: - raise exceptions.QueryParameterError( - 'The `public_file` query paramter should either `0`, `1`, or unused. Instead ' - 'got {}'.format(cleaned_url.args['public_file']), + public_file = cleaned_url.args.get('public_file', None) + if public_file: + if public_file not in ['True', 'False']: + raise core_exceptions.QueryParameterError( + 'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']), url=download_url, provider=self.NAME, - code=400, + code=HTTPStatus.BAD_REQUEST, ) - - is_public = cleaned_url.args['public_file'] == '1' + is_public = public_file == 'True' return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url, is_public=is_public) @@ -140,13 +137,13 @@ async def metadata(self): async def download(self): """Download file from WaterButler, returning stream.""" download_url = await self._fetch_download_url() - headers = {settings.MFR_IDENTIFYING_HEADER: '1'} + headers = {provider_settings.MFR_IDENTIFYING_HEADER: '1'} response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers) - if response.status >= 400: + if response.status >= HTTPStatus.BAD_REQUEST: resp_text = await response.text() logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text)) - raise exceptions.DownloadError( + raise core_exceptions.DownloadError( 'Unable to download the requested file, please try again later.', download_url=download_url, response=resp_text, @@ -154,7 +151,7 @@ async def download(self): ) self.metrics.add('download.saw_redirect', False) - if response.status in (302, 301): + if response.status in (HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND): await response.release() response = await aiohttp.request('GET', response.headers['location']) self.metrics.add('download.saw_redirect', True) @@ -186,8 +183,8 @@ async def _fetch_download_url(self): ) await request.release() - if request.status != 302: - raise exceptions.MetadataError( + if request.status != HTTPStatus.FOUND: + raise core_exceptions.MetadataError( request.reason, metadata_url=self.url, provider=self.NAME, diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py index ae485a125..5d90d3990 100644 --- a/tests/extensions/office365/test_renderer.py +++ b/tests/extensions/office365/test_renderer.py @@ -1,16 +1,23 @@ +from urllib import parse + import furl import pytest -from mfr.extensions.office365 import settings from mfr.core.provider import ProviderMetadata from mfr.extensions.office365 import Office365Renderer +from mfr.extensions.office365 import settings as office365_settings @pytest.fixture def metadata(): - return ProviderMetadata('test', '.pdf', 'text/plain', '1234', + return ProviderMetadata( + 'test', + '.pdf', + 'text/plain', + '1234', 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', - is_public=True) + is_public=True + ) @pytest.fixture @@ -20,7 +27,7 @@ def file_path(): @pytest.fixture def url(): - return 'http://osf.io/file/test.pdf' + return parse.quote('http://osf.io/file/test.pdf') @pytest.fixture @@ -40,8 +47,8 @@ def renderer(metadata, file_path, url, assets_url, export_url): class TestOffice365Renderer: - def test_render_pdf(self, renderer, metadata, assets_url): + def test_render_pdf(self, renderer, metadata): download_url = furl.furl(metadata.download_url).set(query='') - body_url = settings.OFFICE_BASE_URL + download_url.url + office_render_url = office365_settings.OFFICE_BASE_URL + parse.quote(download_url.url) body = renderer.render() - assert ''.format(body_url) in body + assert ''.format(office_render_url) in body diff --git a/tests/server/handlers/test_query_params.py b/tests/server/handlers/test_query_params.py index b707035dc..001c42b27 100644 --- a/tests/server/handlers/test_query_params.py +++ b/tests/server/handlers/test_query_params.py @@ -7,7 +7,7 @@ from tests import utils -class TestRenderHandler(utils.HandlerTestCase): +class TestQueryParamsHandler(utils.HandlerTestCase): @testing.gen_test def test_format_url(self):