Skip to content

[SVCS-488] Improve docx rendering - MFR Part #304

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions mfr/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,24 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa
'response': self.response
}])


class QueryParameterError(ProviderError):
"""The MFR related errors raised from a :class:`mfr.core.provider`and relating to query
parameters. This error is thrown when the query has an invalid value.
"""

__TYPE = 'query_parameter'

def __init__(self, message, *args, url: str='', code: int=400, **kwargs):
super().__init__(message, code=code, *args, **kwargs)
self.url = url
self.return_code = code
self.attr_stack.append([self.__TYPE, {
'url': self.url,
'returncode': self.return_code,
}])


class TooBigToRenderError(ProviderError):
"""If the user tries to render a file larger than a server specified maximum, throw a
TooBigToRenderError.
Expand Down
18 changes: 11 additions & 7 deletions mfr/core/provider.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import abc
import markupsafe

from aiohttp import HttpBadRequest
import furl
import markupsafe

from mfr.core import exceptions
from mfr.server import settings
from mfr.core import exceptions as core_exceptions
from mfr.core.metrics import MetricsRecord
from mfr.server import settings as server_settings
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is settings shadowed somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. This is 6-month old code. Needs update. It no longer make sense to me.



class BaseProvider(metaclass=abc.ABCMeta):
Expand All @@ -17,12 +18,13 @@ class BaseProvider(metaclass=abc.ABCMeta):
def __init__(self, request, url):
self.request = request
url_netloc = furl.furl(url).netloc
if url_netloc not in settings.ALLOWED_PROVIDER_NETLOCS:
raise exceptions.ProviderError(
if url_netloc not in server_settings.ALLOWED_PROVIDER_NETLOCS:
raise core_exceptions.ProviderError(
message="{} is not a permitted provider domain.".format(
markupsafe.escape(url_netloc)
),
code=400
# TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later
code=HttpBadRequest.code
)
self.url = url
self.provider_metrics = MetricsRecord('provider')
Expand All @@ -48,12 +50,13 @@ def download(self):

class ProviderMetadata:

def __init__(self, name, ext, content_type, unique_key, download_url):
def __init__(self, name, ext, content_type, unique_key, download_url, is_public=False):
self.name = name
self.ext = ext
self.content_type = content_type
self.unique_key = unique_key
self.download_url = download_url
self.is_public = is_public

def serialize(self):
return {
Expand All @@ -62,4 +65,5 @@ def serialize(self):
'content_type': self.content_type,
'unique_key': str(self.unique_key),
'download_url': str(self.download_url),
'is_public': self.is_public,
}
18 changes: 17 additions & 1 deletion mfr/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,29 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url):
:rtype: :class:`mfr.core.extension.BaseRenderer`
"""
normalized_name = (name and name.lower()) or 'none'
if metadata.is_public:
try:
# Use the public renderer if exist
return driver.DriverManager(
namespace='mfr.public_renderers',
name=normalized_name,
invoke_on_load=True,
invoke_args=(metadata, file_path, url, assets_url, export_url),
).driver
except:
# If public render does not exist, use default renderer by MFR
# If public render exists but exceptions occurs, delay the exception handling
pass

try:
# Use the default MFR handler
return driver.DriverManager(
namespace='mfr.renderers',
name=normalized_name,
invoke_on_load=True,
invoke_args=(metadata, file_path, url, assets_url, export_url),
).driver
except RuntimeError:
except:
raise exceptions.MakeRendererError(
namespace='mfr.renderers',
name=normalized_name,
Expand All @@ -97,6 +112,7 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url):
}
)


def sizeof_fmt(num, suffix='B'):
if abs(num) < 1000:
return '%3.0f%s' % (num, suffix)
Expand Down
20 changes: 20 additions & 0 deletions mfr/extensions/office365/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

# Office 365 Renderer


This renderer uses Office Online to render .docx files for us. If the Office Online URL ever changes, it will also need to be changed here in settings.

Currently there is no OSF side component for these changes. Once there is, this specific note can be removed. In the meantime in order to test this renderer, you need to go to your local OSF copy of this file: https://github.com/CenterForOpenScience/osf.io/blob/develop/addons/base/views.py#L728-L736
and add 'public_file' : 1, to the dict. This will send all files as public files.

Testing this renderer locally is hard. Since Office Online needs access to the files it will not work with private files or ones hosted locally. To see what the docx files will render like, replace the render function with something that looks like this:

```
def render(self):
static_url = 'https://files.osf.io/v1/resources/<fake_project_id>/providers/osfstorage/<fake_file_id>'
url = settings.OFFICE_BASE_URL + download_url.url
return self.TEMPLATE.render(base=self.assets_url, url=url)

```

The file at `static_url` must be publicly available.
1 change: 1 addition & 0 deletions mfr/extensions/office365/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .render import Office365Renderer # noqa
40 changes: 40 additions & 0 deletions mfr/extensions/office365/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from urllib import parse

import furl
from mako.lookup import TemplateLookup

from mfr.core import extension
from mfr.extensions.office365 import settings as office365_settings
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does this need to be qualified?

Copy link
Contributor Author

@cslzchen cslzchen May 3, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the previous comment. We no longer do alias if there is no shadowing issues. Needs update.



class Office365Renderer(extension.BaseRenderer):
"""A renderer for .docx files that are publicly available.

Office online can render `.docx` files to `.pdf` for us. This renderer will only be made
if a query param with `public_file=true` is present. It then generates and embeds an
office online URL into an `iframe` and returns the template. The file it is trying to
render MUST be public.

Note: this renderer DOES NOT work locally.

"""

TEMPLATE = TemplateLookup(
directories=[
os.path.join(os.path.dirname(__file__), 'templates')
]).get_template('viewer.mako')

def render(self):
download_url = furl.furl(self.metadata.download_url).set(query='').url
encoded_download_url = parse.quote(download_url)
office_render_url = office365_settings.OFFICE_BASE_URL + encoded_download_url
return self.TEMPLATE.render(base=self.assets_url, url=office_render_url)

@property
def file_required(self):
return False

@property
def cache_result(self):
return False
6 changes: 6 additions & 0 deletions mfr/extensions/office365/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mfr import settings


config = settings.child('OFFICE365_EXTENSION_CONFIG')

OFFICE_BASE_URL = 'https://view.officeapps.live.com/op/embed.aspx?src='
11 changes: 11 additions & 0 deletions mfr/extensions/office365/templates/viewer.mako
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<style>
iframe {
width: 100%;
height: 800;
}
</style>

<iframe src=${url} frameborder='0'></iframe>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only thing it looks like is in here is another iframe - is it possible to return a 302 here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get your point but not sure how 302 works. OSF creates the parent iframe and we shouldn't modify it from MFR. Is embedded iframe a bad practice? For MFR itself, it should use iframe here since it loads content from an untrusted external web service.


<script src="/static/js/mfr.js"></script>
<script src="/static/js/mfr.child.js"></script>
55 changes: 34 additions & 21 deletions mfr/providers/osf/provider.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
import os
import json
import hashlib
from http import HTTPStatus
import logging
from urllib.parse import urlparse
import mimetypes
import os
from urllib.parse import urlparse

import furl
import aiohttp
from aiohttp.errors import ContentEncodingError

from waterbutler.core import streams

from mfr.core import exceptions
from mfr.core import provider
from mfr.core.utils import sizeof_fmt
from mfr.providers.osf import settings
from mfr.settings import MAX_FILE_SIZE_TO_RENDER
from mfr.core.exceptions import TooBigToRenderError
from mfr import settings as mfr_settings
from mfr.core import exceptions as core_exceptions
from mfr.core import provider, utils
from mfr.providers.osf import settings as provider_settings

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -74,14 +73,14 @@ async def metadata(self):
response_reason = metadata_response.reason
response_headers = metadata_response.headers
await metadata_response.release()
if response_code != 200:
raise exceptions.MetadataError(
if response_code != HTTPStatus.OK:
raise core_exceptions.MetadataError(
'Failed to fetch file metadata from WaterButler. Received response: ',
'code {} {}'.format(str(response_code), str(response_reason)),
metadata_url=download_url,
response=response_reason,
provider=self.NAME,
code=400
code=HTTPStatus.BAD_REQUEST
)

try:
Expand All @@ -104,12 +103,12 @@ async def metadata(self):
name, ext = os.path.splitext(metadata['data']['name'])
size = metadata['data']['size']

max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext)
max_file_size = mfr_settings.MAX_FILE_SIZE_TO_RENDER.get(ext)
if max_file_size and size and int(size) > max_file_size:
raise TooBigToRenderError(
raise core_exceptions.TooBigToRenderError(
"This file with extension '{ext}' exceeds the size limit of {max_size} and will not "
"be rendered. To view this file download it and view it "
"offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)),
"offline.".format(ext=ext, max_size=utils.sizeof_fmt(max_file_size)),
requested_size=int(size), maximum_size=max_file_size,
)

Expand All @@ -119,26 +118,40 @@ async def metadata(self):
cleaned_url.args.pop(unneeded, None)
self.metrics.add('metadata.clean_url_args', str(cleaned_url))
unique_key = hashlib.sha256((metadata['data']['etag'] + cleaned_url.url).encode('utf-8')).hexdigest()
return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url)

is_public = False
public_file = cleaned_url.args.get('public_file', None)
if public_file:
if public_file not in ['True', 'False']:
raise core_exceptions.QueryParameterError(
'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']),
url=download_url,
provider=self.NAME,
code=HTTPStatus.BAD_REQUEST,
)
is_public = public_file == 'True'

return provider.ProviderMetadata(name, ext, content_type,
unique_key, download_url, is_public=is_public)

async def download(self):
"""Download file from WaterButler, returning stream."""
download_url = await self._fetch_download_url()
headers = {settings.MFR_IDENTIFYING_HEADER: '1'}
headers = {provider_settings.MFR_IDENTIFYING_HEADER: '1'}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again here, wondering why this needs to be qualified

Copy link
Contributor Author

@cslzchen cslzchen May 3, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, this should be pd_settings as we have been using it for both WB and MFR.

response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers)

if response.status >= 400:
if response.status >= HTTPStatus.BAD_REQUEST:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dig it.

resp_text = await response.text()
logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text))
raise exceptions.DownloadError(
raise core_exceptions.DownloadError(
'Unable to download the requested file, please try again later.',
download_url=download_url,
response=resp_text,
provider=self.NAME,
)

self.metrics.add('download.saw_redirect', False)
if response.status in (302, 301):
if response.status in (HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be a list rather than a tuple?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No sure. I had the same question. However, it was a tuple and the code base uses tuple.

await response.release()
response = await aiohttp.request('GET', response.headers['location'])
self.metrics.add('download.saw_redirect', True)
Expand Down Expand Up @@ -170,8 +183,8 @@ async def _fetch_download_url(self):
)
await request.release()

if request.status != 302:
raise exceptions.MetadataError(
if request.status != HTTPStatus.FOUND:
raise core_exceptions.MetadataError(
request.reason,
metadata_url=self.url,
provider=self.NAME,
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def parse_requirements(requirements):
'http = mfr.providers.http:HttpProvider',
'osf = mfr.providers.osf:OsfProvider',
],
'mfr.public_renderers': [
'.docx = mfr.extensions.office365:Office365Renderer',
],
'mfr.exporters': [
# google docs
'.gdraw = mfr.extensions.image:ImageExporter',
Expand Down
Empty file.
54 changes: 54 additions & 0 deletions tests/extensions/office365/test_renderer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from urllib import parse

import furl
import pytest

from mfr.core.provider import ProviderMetadata
from mfr.extensions.office365 import Office365Renderer
from mfr.extensions.office365 import settings as office365_settings


@pytest.fixture
def metadata():
return ProviderMetadata(
'test',
'.pdf',
'text/plain',
'1234',
'http://wb.osf.io/file/test.pdf?token=1234&public_file=1',
is_public=True
)


@pytest.fixture
def file_path():
return '/tmp/test.docx'


@pytest.fixture
def url():
return parse.quote('http://osf.io/file/test.pdf')


@pytest.fixture
def assets_url():
return 'http://mfr.osf.io/assets'


@pytest.fixture
def export_url():
return 'http://mfr.osf.io/export?url=' + url()


@pytest.fixture
def renderer(metadata, file_path, url, assets_url, export_url):
return Office365Renderer(metadata, file_path, url, assets_url, export_url)


class TestOffice365Renderer:

def test_render_pdf(self, renderer, metadata):
download_url = furl.furl(metadata.download_url).set(query='')
office_render_url = office365_settings.OFFICE_BASE_URL + parse.quote(download_url.url)
body = renderer.render()
assert '<iframe src={} frameborder=\'0\'></iframe>'.format(office_render_url) in body
2 changes: 1 addition & 1 deletion tests/server/handlers/test_query_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tests import utils


class TestRenderHandler(utils.HandlerTestCase):
class TestQueryParamsHandler(utils.HandlerTestCase):

@testing.gen_test
def test_format_url(self):
Expand Down