Skip to content

patch updates to crawler fetcher #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: cfa-main-1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/crawler-fetcher/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Crawler fetcher
#

FROM gcr.io/mcback/common:latest
FROM gcr.io/mcback/common:release

# Copy sources
COPY src/ /opt/mediacloud/src/crawler-fetcher/
Expand Down
5 changes: 4 additions & 1 deletion apps/crawler-fetcher/src/python/crawler_fetcher/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,10 @@ def run_fetcher(no_daemon: bool = False) -> None:
try:
downloads_id = db.query("SELECT pop_queued_download()").flat()[0]
if downloads_id:
download = db.find_by_id(table='downloads', object_id=downloads_id)
try:
download = db.find_by_id(table='downloads', object_id=downloads_id)
except McCrawlerFetcherSoftError as ex:
_log_download_error(db=db, download=download, error_message=str(ex))

idle_timer.stop()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from mediawords.db import DatabaseHandler
from mediawords.dbi.stories.stories import add_story
from mediawords.util.log import create_logger
from mediawords.util.parse_html import html_title
from mediawords.util.perl import decode_object_from_bytes_if_needed
from mediawords.util.sql import sql_now
Expand All @@ -12,6 +13,7 @@
from crawler_fetcher.handlers.default.fetch_mixin import DefaultFetchMixin
from crawler_fetcher.handlers.feed import AbstractDownloadFeedHandler

log = create_logger(__name__)

class DownloadFeedWebPageHandler(DefaultFetchMixin, AbstractDownloadFeedHandler, AbstractDownloadHandler):
"""Handler for 'web_page' feed downloads."""
Expand Down Expand Up @@ -42,8 +44,10 @@ def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: st
'title': title,
}

story = add_story(db=db, story=new_story, feeds_id=feeds_id)
if not story:
try:
story = add_story(db=db, story=new_story, feeds_id=feeds_id)
except McCrawlerFetcherSoftError as ex:
log.error(f"Failed to add story {new_story} : {ex}")
raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

db.query("""
Expand All @@ -60,19 +64,24 @@ def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: st
story_ids = [
story['stories_id'],
]

log.info(f"Feed webpage: added stories, count ..{len(story_ids)}")
return story_ids

def return_stories_to_be_extracted_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    """Return the IDs of stories to be extracted for this 'web_page' feed download.

    For web page feeds the story was just added by add_stories_from_feed(), so the
    single story referenced by the (re-fetched) download row is what gets extracted.

    :param db: database handler used to re-read the download row.
    :param download: download row dict; must contain 'downloads_id'.
    :param content: fetched page content (unused here).
    :return: list with the single stories_id to extract.
    :raises McCrawlerFetcherSoftError: if the download row can no longer be read.
    """
    download = decode_object_from_bytes_if_needed(download)
    # content = decode_object_from_bytes_if_needed(content)

    try:
        # Re-read the row: it might have been changed by add_stories_from_feed()
        download = db.find_by_id(table='downloads', object_id=download['downloads_id'])
    except McCrawlerFetcherSoftError as ex:
        # Log, then re-raise: swallowing the error here would let us return a
        # stories_id taken from a stale (possibly deleted) download row.
        log.error(f"Feed webpage: download {download['downloads_id']} does not exist, {ex}")
        raise

    # Extract web page download that was just fetched
    stories_to_extract = [
        download['stories_id'],
    ]

    log.info(f"Feed, webpage stories to be extracted, count ..{len(stories_to_extract)}")

    return stories_to_extract
9 changes: 7 additions & 2 deletions apps/crawler-fetcher/src/python/crawler_fetcher/new_story.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import datetime
from typing import Optional

from crawler_fetcher.exceptions import McCrawlerFetcherSoftError

from mediawords.db import DatabaseHandler
from mediawords.dbi.stories.stories import add_story
from mediawords.util.perl import decode_object_from_bytes_if_needed
Expand Down Expand Up @@ -45,8 +47,11 @@ def add_story_and_content_download(db: DatabaseHandler, story: dict, parent_down
"""If the story is new, add it to the database and also add a pending download for the story content."""
story = decode_object_from_bytes_if_needed(story)
parent_download = decode_object_from_bytes_if_needed(parent_download)

story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])

try:
story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])
except McCrawlerFetcherSoftError as ex:
raise McCrawlerFetcherSoftError(f"Error adding story, feed_id:{parent_download['feeds_id']} story_id:{story['url']}: {ex}")

if story:
if story.get('is_new', False):
Expand Down