Skip to content

Commit 6df8e19

Browse files
Merge pull request #119 from scrapinghub/fix-incorrectly-formatted-description-property
Fix incorrectly formatted description property
2 parents 50a0915 + 670702e commit 6df8e19

14 files changed

+3133
-13
lines changed

extruct/w3cmicrodata.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,32 @@
1818
from urllib.parse import urljoin
1919

2020
import lxml.etree
21+
from lxml.html.clean import Cleaner
2122
from w3lib.html import strip_html5_whitespace
23+
import html_text
2224

2325
from extruct.utils import parse_html
2426

2527

28+
# Cleaner similar to html_text's cleaner, but less aggressive: it keeps forms, frames, embedded content and inline event-handler attributes
29+
cleaner = Cleaner(
30+
scripts=True,
31+
javascript=False, # onclick attributes are fine
32+
comments=True,
33+
style=True,
34+
links=True,
35+
meta=True,
36+
page_structure=False, # <title> may be nice to have
37+
processing_instructions=True,
38+
embedded=False, # keep embedded content
39+
frames=False, # keep frames
40+
forms=False, # keep forms
41+
annoying_tags=False,
42+
remove_unknown_tags=False,
43+
safe_attrs_only=False,
44+
)
45+
46+
2647
class LxmlMicrodataExtractor(object):
2748
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
2849
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
@@ -182,7 +203,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
182203
return self._extract_textContent(node)
183204

184205
def _extract_textContent(self, node):
185-
return u"".join(self._xp_clean_text(node)).strip()
206+
clean_node = cleaner.clean_html(node)
207+
return html_text.etree_to_text(clean_node)
186208

187209

188210
MicrodataExtractor = LxmlMicrodataExtractor

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,5 +7,6 @@ requests
77
rdflib
88
rdflib-jsonld
99
mf2py>=1.1.0
10-
six
10+
six>=1.11
1111
w3lib
12+
html-text

setup.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def get_version():
3333
'rdflib-jsonld',
3434
'mf2py',
3535
'w3lib',
36+
'html-text>=0.5.1',
3637
'six'],
3738
extras_require={
3839
'service': [

tests/samples/schema.org/Event.002.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@
3737
"offers": "foo-fighters-everlong-buy.html",
3838
"url": "foo-fighters-everlong.html"},
3939
"type": "http://schema.org/MusicRecording"}],
40-
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
40+
"video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
4141
"duration": "T1M33S",
4242
"name": "Interview with the Foo Fighters",
4343
"thumbnail": "foo-fighters-interview-thumb.jpg"},

tests/samples/schema.org/MusicRecording.001.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@
3737
"offers": "foo-fighters-everlong-buy.html",
3838
"url": "foo-fighters-everlong.html"},
3939
"type": "http://schema.org/MusicRecording"}],
40-
"video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.",
40+
"video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
4141
"duration": "T1M33S",
4242
"name": "Interview with the Foo Fighters",
4343
"thumbnail": "foo-fighters-interview-thumb.jpg"},

tests/samples/schema.org/product-ref.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
],
3333
"brand": "ACME",
3434
"name": "Executive Anvil",
35-
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
35+
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
3636
"mpn": "925872",
3737
"aggregateRating": {
3838
"type": "http://schema.org/AggregateRating",

tests/samples/schema.org/product.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"properties": {"brand": "ACME",
33
"name": "Executive Anvil",
44
"image": "anvil_executive.jpg",
5-
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
5+
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
66
"mpn": "925872",
77
"aggregateRating": {"type": "http://schema.org/AggregateRating",
88
"properties": {"ratingValue": "4.4",

tests/samples/schema.org/product_custom_url.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"properties": {"brand": "ACME",
33
"name": "Executive Anvil",
44
"image": "http://some-example.com/anvil_executive.jpg",
5-
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
5+
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
66
"mpn": "925872",
77
"aggregateRating": {"type": "http://schema.org/AggregateRating",
88
"properties": {"ratingValue": "4.4",

tests/samples/schema.org/product_custom_url_and_node_id.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
"properties": {"brand": "ACME",
44
"name": "Executive Anvil",
55
"image": "http://some-example.com/anvil_executive.jpg",
6-
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
6+
"description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
77
"mpn": "925872",
88
"aggregateRating": {"type": "http://schema.org/AggregateRating",
99
"_nodeId_": "aggregateRating",

tests/samples/w3c/microdata.5.2.withtext.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,18 +2,18 @@
22
"name": "Tank Locomotive (DB 80)",
33
"product-code": "33041",
44
"scale": "HO"},
5-
"textContent": "Name:\n Tank Locomotive (DB 80)\n Product code:\n 33041\n Scale:\n HO\n Digital:\n Delta",
5+
"textContent": "Name:\nTank Locomotive (DB 80)\nProduct code:\n33041\nScale:\nHO\nDigital:\nDelta",
66
"type": ["http://md.example.com/loco",
77
"http://md.example.com/lighting"]},
88
{"properties": {"name": "Turnout Lantern Kit",
99
"product-code": "74470",
1010
"scale": "HO",
1111
"track-type": "C"},
12-
"textContent": "Name:\n Turnout Lantern Kit\n Product code:\n 74470\n Purpose:\n For retrofitting 2 C Track\n turnouts.",
12+
"textContent": "Name:\nTurnout Lantern Kit\nProduct code:\n74470\nPurpose:\nFor retrofitting 2 C Track turnouts.",
1313
"type": ["http://md.example.com/track",
1414
"http://md.example.com/lighting"]},
1515
{"properties": {"name": "Express Train Passenger Car (DB Am 203)",
1616
"product-code": "8710",
1717
"scale": "Z"},
18-
"textContent": "Name:\n Express Train Passenger Car (DB Am 203)\n Product code:\n 8710\n Scale:\n Z",
18+
"textContent": "Name:\nExpress Train Passenger Car (DB Am 203)\nProduct code:\n8710\nScale:\nZ",
1919
"type": "http://md.example.com/passengers"}]

0 commit comments

Comments
 (0)