in exporting, handle binary files from server routes (#1398)

Conduitry · ruben · aloker · web-flow · commit af33c09b2b40 · 2020-08-12T11:59:47.000-04:00
- follow <img src>, <source src>, <source srcset> when crawling
- don't corrupt binary responses from server routes

Co-authored-by: ruben <r.vanderzwaan@datasprong.nl>
Co-authored-by: Andre Loker <140714+aloker@users.noreply.github.com>
diff --git a/runtime/src/server/middleware/get_server_route_handler.ts b/runtime/src/server/middleware/get_server_route_handler.ts
@@ -37,7 +37,7 @@ export function get_server_route_handler(routes: ServerRoute[]) {
 						method: req.method,
 						status: res.statusCode,
 						type: headers['content-type'],
-						body: Buffer.concat(chunks).toString()
+						body: Buffer.concat(chunks)
 					});
 				};
 			}
diff --git a/src/api/export.ts b/src/api/export.ts
@@ -50,6 +50,29 @@ function get_href(attrs: string) {
 	return match && (match[1] || match[2] || match[3]);
 }
 
+function get_src(attrs: string) { // value of the first src= found in attrs; falsy (null or undefined) when there is none
+	const match = /src\s*=\s*(?:"(.*?)"|'(.*?)'|([^\s>]*))/.exec(attrs); // accepts double-quoted, single-quoted and unquoted values
+	return match && (match[1] || match[2] || match[3]); // NOTE(review): the pattern is unanchored, so attribute names ending in src (e.g. data-src) match too
+}
+
+export function get_srcset_urls(attrs: string) { // URLs listed in the first srcset= of attrs, descriptors (640w, 2x, ...) dropped; [] when there is none
+	const results: string[] = [];
+	// Note that the srcset allows any ASCII whitespace, including newlines (hence the s flag).
+	const match = /srcset\s*=\s*(?:"(.*?)"|'(.*?)'|([^\s>]*))/s.exec(attrs);
+	if (match) {
+		const attr_content = match[1] || match[2] || match[3] || ''; // empty value: fall back to '' so exec is never handed undefined (it would be coerced to the text undefined and reported as a URL)
+		// Parse the content of the srcset attribute.
+		// The regexp is modelled after the srcset spec (https://html.spec.whatwg.org/multipage/images.html#srcset-attribute)
+		// and should cover most reasonable cases.
+		const regex = /\s*([^\s,]\S+[^\s,])\s*((?:\d+w)|(?:-?\d+(?:\.\d+)?(?:[eE]-?\d+)?x))?/gm; // group 1 = URL (at least three characters), group 2 = optional width or density descriptor
+		let sub_matches;
+		while (sub_matches = regex.exec(attr_content)) { // global regex: each exec resumes after the previous match
+			results.push(sub_matches[1]);
+		}
+	}
+	return results;
+}
+
 export { _export as export };
 
 async function _export({
@@ -109,7 +132,7 @@ async function _export({
 	const seen = new Set();
 	const saved = new Set();
 
-	function save(url: string, status: number, type: string, body: string) {
+	function save(url: string, status: number, type: string, body: string | ArrayBuffer) {
 		const { pathname } = resolve(origin, url);
 		let file = decodeURIComponent(pathname.slice(1));
 
@@ -122,20 +145,27 @@ async function _export({
 			if (!file.endsWith('.html')) {
 				file = file === '' ? 'index.html' : `${file}/index.html`;
 			}
-			body = minify_html(body);
+
+			if (typeof body === 'string') { // only textual HTML can be minified
+				body = minify_html(body);
+			} else {
+				oninfo({ message: `Content of ${url} has content-type text/html but the content was received as a binary buffer. The HTML will not be minified.` });
+			}
 		}
 
+		const buffer = Buffer.from(body);
+
 		onfile({
 			file,
-			size: body.length,
+			size: buffer.byteLength,
 			status
 		});
 
 		const export_file = path.join(export_dir, file);
 		if (fs.existsSync(export_file)) return;
 		mkdirp(path.dirname(export_file));
 
-		return writeFile(export_file, body);
+		return writeFile(export_file, buffer);
 	}
 
 	function handle(url: URL, fetchOpts: FetchOpts, addCallback: Function) {
@@ -184,7 +214,9 @@ async function _export({
 
 		let type = response.headers.get('Content-Type');
 
-		let body = await response.text();
+		let body = type && type.startsWith('text/') // headers.get returns null when the response carries no Content-Type
+			? await response.text()
+			: await response.arrayBuffer(); // anything not declared as text is kept as raw bytes so it is not corrupted
 
 		const range = ~~(response.status / 100);
 
@@ -193,26 +225,38 @@ async function _export({
 			const link = parseLinkHeader(response.headers.get('Link') || '');
 			link.refs.forEach((ref: Ref) => {
 				if (ref.rel === 'preload') {
-					body = body.replace('</head>',
+					body = (body as string).replace('</head>',
 						`<link rel="preload" as=${JSON.stringify(ref.as)} href=${JSON.stringify(ref.uri)}></head>`);
 				}
 			});
 
 			if (pathname !== '/service-worker-index.html') {
-				const cleaned = clean_html(body);
+				const cleaned = clean_html(body as string);
 
 				const base_match = /<base ([\s\S]+?)>/m.exec(cleaned);
 				const base_href = base_match && get_href(base_match[1]);
 				const base = resolve(url.href, base_href);
 
 				let match;
-				const pattern = /<a ([\s\S]+?)>/gm;
+				const pattern = /<(a|img|source)\s+([\s\S]+?)>/gm;
 
 				while (match = pattern.exec(cleaned)) {
-					const attrs = match[1];
-					const href = get_href(attrs);
+					let hrefs: string[] = [];
+					const element = match[1];
+					const attrs = match[2];
+
+					if (element === 'a') {
+						hrefs.push(get_href(attrs));
+					} else {
+						if (element === 'img') {
+							hrefs.push(get_src(attrs));
+						}
+						hrefs.push.apply(hrefs, get_srcset_urls(attrs));
+					}
+
+					hrefs = hrefs.filter(Boolean);
 
-					if (href) {
+					for (const href of hrefs) {
 						const url = resolve(base.href, href);
 
 						if (url.protocol === protocol && url.host === host) {
diff --git a/test/apps/export/content/example-192.png b/test/apps/export/content/example-192.png
diff --git a/test/apps/export/content/example-512.png b/test/apps/export/content/example-512.png
diff --git a/test/apps/export/content/test.pdf b/test/apps/export/content/test.pdf
diff --git a/test/apps/export/src/routes/blog/[slug].html b/test/apps/export/src/routes/blog/[slug].html
@@ -10,4 +10,15 @@
 	export let post;
 </script>
 
-<h1>{post.title}</h1>
+<h1>{post.title}</h1>
+
+{#if post.src}
+	<picture>
+		<source srcset={post.srcset}/>
+		<img src={post.src}/>
+	</picture>
+{/if}
+
+{#if post.pdf}
+	<a href={post.pdf}>{post.pdf}</a>
+{/if}
diff --git a/test/apps/export/src/routes/blog/_posts.js b/test/apps/export/src/routes/blog/_posts.js
@@ -1,5 +1,5 @@
 export default [
-	{ slug: 'foo', title: 'once upon a foo' },
+	{ slug: 'foo', title: 'once upon a foo', src: 'img/example-512.png', srcset: 'img/example-512.png 512w, img/example-192.png 192w', pdf: 'pdfs/test.pdf' },
 	{ slug: 'bar', title: 'a bar is born' },
 	{ slug: 'baz', title: 'bazzily ever after' }
-];
+];
diff --git a/test/apps/export/src/routes/img/[slug].png.js b/test/apps/export/src/routes/img/[slug].png.js
@@ -0,0 +1,15 @@
+const fs = require('fs');
+const path = require('path');
+const cwd = process.cwd();
+
+export function get(req, res) { // test fixture route: streams content/<slug>.png (relative to the working directory) as image/png
+
+	const { slug } = req.params;
+	const image = path.join(cwd, `/content/${slug}.png`);
+
+	let s = fs.createReadStream(image); // NOTE(review): no error listener is attached, so a missing file is not handled here
+	s.on('open', () => { // status and headers are written only once the file has been opened
+		res.writeHead(200, { 'Content-Type': 'image/png' });
+		s.pipe(res);
+	});
+}
diff --git a/test/apps/export/src/routes/pdfs/[slug].pdf.js b/test/apps/export/src/routes/pdfs/[slug].pdf.js
@@ -0,0 +1,15 @@
+const fs = require('fs');
+const path = require('path');
+const cwd = process.cwd();
+
+export function get(req, res, next) { // test fixture route: streams content/<slug>.pdf (relative to the working directory) as application/pdf; next is accepted but unused
+
+	const { slug } = req.params;
+	const image = path.join(cwd, `/content/${slug}.pdf`); // despite the name, this is the path of the PDF file
+
+	let s = fs.createReadStream(image); // NOTE(review): no error listener is attached, so a missing file is not handled here
+	s.on('open', () => { // status and headers are written only once the file has been opened
+		res.writeHead(200, { 'Content-Type': 'application/pdf' });
+		s.pipe(res);
+	});
+}
diff --git a/test/apps/export/test.ts b/test/apps/export/test.ts
@@ -42,15 +42,32 @@ describe('export', function() {
 			'service-worker-index.html',
 			'service-worker.js',
 			'test.pdf',
+			'img/example-192.png',
+			'img/example-512.png',
+			'pdfs/test.pdf',
 			...boom
 		].sort());
 	});
 
 	it('does not corrupt binary file links (like pdf)', () => {
-		const input = readFileSync(`${__dirname}/static/test.pdf`)
-		const output = readFileSync(`${__dirname}/__sapper__/export/test.pdf`)
-		assert.ok(input.equals(output))
-	})
+		const input = readFileSync(`${__dirname}/static/test.pdf`);
+		const output = readFileSync(`${__dirname}/__sapper__/export/test.pdf`);
+		assert.ok(input.equals(output));
+	});
+
+	it('does not corrupt image files from server routes', () => {
+		for(const file of ['example-192.png', 'example-512.png']) {
+			const input = readFileSync(`${__dirname}/content/${file}`);
+			const output = readFileSync(`${__dirname}/__sapper__/export/img/${file}`);
+			assert.ok(input.equals(output));
+		}
+	});
+
+	it('does not corrupt pdf files from server routes', () => {
+		const input = readFileSync(`${__dirname}/content/test.pdf`);
+		const output = readFileSync(`${__dirname}/__sapper__/export/pdfs/test.pdf`);
+		assert.ok(input.equals(output));
+	});
 
 	// TODO test timeout, basepath
 });
diff --git a/test/unit/srcset/test.ts b/test/unit/srcset/test.ts
@@ -0,0 +1,30 @@
+import * as assert from "assert";
+import { get_srcset_urls } from "../../../src/api/export";
+
+describe("get_srcset_urls", () => { // unit tests: only the URLs of a srcset are returned, width and density descriptors are stripped
+	it("should parse single entry without descriptor", () => {
+		const result = get_srcset_urls("<source srcset=\"assets/image/1.jpg\"/>");
+		assert.deepEqual(result, ["assets/image/1.jpg"]);
+	});
+
+	it("should parse single entry with width descriptor", () => {
+		const result = get_srcset_urls("<source srcset=\"assets/image/1.jpg 1234w\"/>");
+		assert.deepEqual(result, ["assets/image/1.jpg"]);
+	});
+
+	it("should parse single entry with density descriptor", () => {
+		const result = get_srcset_urls("<source srcset=\"assets/image/1.jpg -123.456x\"/>");
+		assert.deepEqual(result, ["assets/image/1.jpg"]);
+	});
+
+	it("should parse multiple entries with different descriptors", () => { // mixes extra whitespace, a newline, bare commas and exponent notation
+		const result = get_srcset_urls("<source srcset=\"   assets/image/1.jpg -1.3E-3x,assets/image/2.jpg 2560w,assets/image/3.jpg, \nassets/image/4.jpg 2.5x  , assets/image/5.jpg 640w   \"/>");
+		assert.deepEqual(result, [
+			"assets/image/1.jpg",
+			"assets/image/2.jpg",
+			"assets/image/3.jpg",
+			"assets/image/4.jpg",
+			"assets/image/5.jpg",
+		]);
+	});
+});

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ export function get_server_route_handler(routes: ServerRoute[]) {`
`37`	`37`	`method: req.method,`
`38`	`38`	`status: res.statusCode,`
`39`	`39`	`type: headers['content-type'],`
`40`		`- body: Buffer.concat(chunks).toString()`
	`40`	`+ body: Buffer.concat(chunks)`
`41`	`41`	`});`
`42`	`42`	`};`
`43`	`43`	`}`