Skip to content

Movie sample changes (in progress) - DO NOT MERGE before Nov 15. #597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Nov 15, 2016
77 changes: 23 additions & 54 deletions language/movie_nl/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,20 @@

from googleapiclient import discovery
from googleapiclient.errors import HttpError
import httplib2
from oauth2client.client import GoogleCredentials
import requests

# TODO REMOVE - when discovery is public
DISCOVERY_URL = ('https://language.googleapis.com/$discovery/rest?'
'version=v1&labels=GOOGLE_INTERNAL')

def analyze_document(service, document):
"""Analyze the document and get the distribution of sentiments and
the movie name."""
logging.info('Analyzing {}'.format(document.doc_id))

sentences, entities = document.extract_all_sentences(service)

sentiments = [get_sentiment(service, sentence) for sentence in sentences]

sentiments, entities = document.extract_sentiment_entities(service)
return sentiments, entities


Expand All @@ -56,62 +57,35 @@ def get_request_body(text, syntax=True, entities=True, sentiment=True):
return body


def get_sentiment(service, sentence):
    """Return the ``(polarity, magnitude)`` sentiment for one sentence.

    The sentence is sent through the service's ``annotateText`` call
    (retried up to three times) and ``documentSentiment`` is read from the
    response.

    Returns ``(None, None)`` when the response carries no document
    sentiment. When a magnitude is present but the polarity is missing,
    the polarity is reported as 0.
    """
    payload = get_request_body(
        sentence, syntax=False, entities=True, sentiment=True)
    annotate_call = service.documents().annotateText(body=payload)
    doc_sentiment = annotate_call.execute(num_retries=3).get(
        'documentSentiment')

    # Guard clause: nothing to unpack when the API returned no sentiment.
    if doc_sentiment is None:
        return (None, None)

    polarity = doc_sentiment.get('polarity')
    magnitude = doc_sentiment.get('magnitude')
    if magnitude is not None and polarity is None:
        polarity = 0
    return (polarity, magnitude)


class Document(object):
"""Document class captures a single document of movie reviews."""

def __init__(self, text, doc_id, doc_path):
self.text = text
self.doc_id = doc_id
self.doc_path = doc_path
self.sentence_entity_pair = None
self.sentiment_entity_pair = None
self.label = None

def extract_all_sentences(self, service):
def extract_sentiment_entities(self, service):
"""Extract the document-level sentiment and the entities in a document."""

if self.sentence_entity_pair is not None:
if self.sentiment_entity_pair is not None:
return self.sentiment_entity_pair

docs = service.documents()
request_body = get_request_body(
self.text,
syntax=True,
syntax=False,
entities=True,
sentiment=False)
sentiment=True)
request = docs.annotateText(body=request_body)

ent_list = []

response = request.execute()
entities = response.get('entities', [])
sentences = response.get('sentences', [])

sent_list = [
sentence.get('text', {}).get('content') for sentence in sentences
]
documentSentiment = response.get('documentSentiment', {})

for entity in entities:
ent_type = entity.get('type')
Expand All @@ -120,9 +94,9 @@ def extract_all_sentences(self, service):
if ent_type == 'PERSON' and wiki_url is not None:
ent_list.append(wiki_url)

self.sentence_entity_pair = (sent_list, ent_list)
self.sentiment_entity_pair = (documentSentiment, ent_list)

return self.sentence_entity_pair
return self.sentiment_entity_pair


def to_sentiment_json(doc_id, sent, label):
Expand Down Expand Up @@ -200,18 +174,9 @@ def get_sentiment_entities(service, document):
"""

sentiments, entities = analyze_document(service, document)
score = sentiments.get('score')

sentiments = [sent for sent in sentiments if sent[0] is not None]
negative_sentiments = [
polarity for polarity, magnitude in sentiments if polarity < 0.0]
positive_sentiments = [
polarity for polarity, magnitude in sentiments if polarity > 0.0]

negative = sum(negative_sentiments)
positive = sum(positive_sentiments)
total = positive + negative

return (total, entities)
return (score, entities)


def get_sentiment_label(sentiment):
Expand Down Expand Up @@ -315,12 +280,16 @@ def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False):


def get_service():
"""Build a client to the Google Cloud Natural Language API."""
"""Build a client to the Google Cloud Natural Language API."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extra "

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


credentials = GoogleCredentials.get_application_default()

return discovery.build('language', 'v1beta1',
credentials=credentials)
scoped_credentials = credentials.create_scoped(
['https://www.googleapis.com/auth/cloud-platform'])
http = httplib2.Http()
scoped_credentials.authorize(http)
return discovery.build('language', 'v1',
http=http,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should only need to specify credentials and not http, yeah?

discoveryServiceUrl=DISCOVERY_URL)


def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file):
Expand Down
6 changes: 3 additions & 3 deletions language/movie_nl/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,18 @@ def test_process_movie_reviews():
entities = [json.loads(entity) for entity in entities]

# assert sentiments
assert sentiments[0].get('sentiment') == 1.0
assert sentiments[0].get('sentiment') == 0.9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't these values change as the model changes? Should you just assert that sentiments are some non-None value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but they don't change that frequently, so the check above lets us see whether something really failed. A non-None check would not let us detect an issue the model or API might have.

assert sentiments[0].get('label') == 1

assert sentiments[1].get('sentiment') == 1.0
assert sentiments[1].get('sentiment') == 0.8
assert sentiments[1].get('label') == 1

# assert entities
assert len(entities) == 1
assert entities[0].get('name') == 'Tom Cruise'
assert (entities[0].get('wiki_url') ==
'http://en.wikipedia.org/wiki/Tom_Cruise')
assert entities[0].get('sentiment') == 2.0
assert entities[0].get('sentiment') == 1.7


def test_rank_positive_entities(capsys):
Expand Down