Face detection beta features #1414
@@ -0,0 +1,203 @@
#!/usr/bin/env python

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates face detection, face emotions
and speech transcription using the Google Cloud API.

Usage Examples:
    python beta_snippets.py boxes \
        gs://python-docs-samples-tests/video/googlework_short.mp4

    python beta_snippets.py \
        emotions gs://python-docs-samples-tests/video/googlework_short.mp4

    python beta_snippets.py \
        transcription gs://python-docs-samples-tests/video/googlework_short.mp4
"""

import argparse

from google.cloud import videointelligence_v1p1beta1 as videointelligence


# [START video_face_bounding_boxes]
def face_bounding_boxes(path):

Can they use a local file too or just a GCS file?

They can. The request needs to be only slightly different, such as
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/video/cloud-client/analyze/analyze.py#L134

Do we only need to provide local file snippets, no GCS snippets?

Here we are showing only GCS snippets.

OH, do Python samples all use `path`? It should be

    def detect_labels(path):
        """Detects labels in the file."""
        client = vision.ImageAnnotatorClient()

    def detect_labels_uri(uri):
        """Detects labels in the file located in Google Cloud Storage or on the
        Web."""
        client = vision.ImageAnnotatorClient()

Let's make this change across the board to all 3 samples.

you are right - it should be

(A sketch of a possible local-file variant appears after this function.)

    """ Detects faces' bounding boxes. """
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.FACE_DETECTION]

    config = videointelligence.types.FaceConfig(
        include_bounding_boxes=True)
    context = videointelligence.types.VideoContext(
        face_detection_config=config)

    operation = video_client.annotate_video(
        path, features=features, video_context=context)
    print('\nProcessing video for face annotations:')

    result = operation.result(timeout=900)
    print('\nFinished processing.')

    # There is only one result because a single video was processed.
    faces = result.annotation_results[0].face_detection_annotations
    for i, face in enumerate(faces):
        print('Face {}'.format(i))

        # Each face_detection_annotation has only one segment.
        segment = face.segments[0]
        start_time = (segment.segment.start_time_offset.seconds +
                      segment.segment.start_time_offset.nanos / 1e9)
        end_time = (segment.segment.end_time_offset.seconds +
                    segment.segment.end_time_offset.nanos / 1e9)
        positions = '{}s to {}s'.format(start_time, end_time)
        print('\tSegment: {}\n'.format(positions))

        # Each detected face may appear in many frames of the video.
        # Here we process only the first frame.
        frame = face.frames[0]

        time_offset = (frame.time_offset.seconds +
                       frame.time_offset.nanos / 1e9)
        box = frame.attributes[0].normalized_bounding_box

        print('First frame time offset: {}s\n'.format(time_offset))

        print('First frame normalized bounding box:')
        print('\tleft : {}'.format(box.left))
        print('\ttop : {}'.format(box.top))
        print('\tright : {}'.format(box.right))
        print('\tbottom: {}'.format(box.bottom))
        print('\n')
# [END video_face_bounding_boxes]


# [START video_face_emotions]
def face_emotions(path):
    """ Analyze faces' emotions over frames. """
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.FACE_DETECTION]

    config = videointelligence.types.FaceConfig(
        include_emotions=True)
    context = videointelligence.types.VideoContext(
        face_detection_config=config)

    operation = video_client.annotate_video(
        path, features=features, video_context=context)
    print('\nProcessing video for face annotations:')

    result = operation.result(timeout=600)
    print('\nFinished processing.')

    # There is only one result because a single video was processed.
    faces = result.annotation_results[0].face_detection_annotations
    for i, face in enumerate(faces):
        print('Face {}'.format(i))

        frame_emotions = []
        for frame in face.frames:
            time_offset = (frame.time_offset.seconds +
                           frame.time_offset.nanos / 1e9)
            emotions = frame.attributes[0].emotions

            # from videointelligence.enums
            emotion_labels = (
                'EMOTION_UNSPECIFIED', 'AMUSEMENT', 'ANGER',
                'CONCENTRATION', 'CONTENTMENT', 'DESIRE',
                'DISAPPOINTMENT', 'DISGUST', 'ELATION',
                'EMBARRASSMENT', 'INTEREST', 'PRIDE', 'SADNESS',
                'SURPRISE')

Is there any way to use the videointelligence.enums object directly or pull
this programmatically? How badly will this break if the enum changes in the
future?

The simplest way is for the enums in videointelligence.enums to extend Python's
Enum class, which would allow us to map the enums back to their names. However
the client libraries do not do that (yet). I don't know of a clean way to pull
the names programmatically, and as it stands now the sample code will break in
these possible ways:

The best long term solution is to simply extend Enum. When that happens we will
need to come back and update these samples.

+1 Programmatically, this keeps showing up in Python samples and, as a user, it
seems really frustrating that Python makes you do this... you get the proto's
enum value index via the library but not the enum value name? I don't think any
of the other languages have this problem? (Make this better! 😭)

Without fixing the client library generation tool, this is the closest I could
come up with:

I would rather not have this as is in a code snippet - but perhaps abstracted
away as a helper function? The reader of the sample would not be able to see
the full list of enums this way.

To be clear, I wasn't thinking about fixing this in the code sample, but rather
in google-cloud-core. IMO the code sample should use an inline, literal list
with the values. The display name of the enum value should be printed by
getting it by index, as we do today.

I'll move this to a thread on google-cloud-core 😄

Code in this PR regarding Enum LGTM.

(One possible introspection-based helper is sketched after this function.)

            emotion, score = sorted(
                [(em.emotion, em.score) for em in emotions],
                key=lambda p: p[1])[-1]

Non-trivial logic alert!

@nnegrey I think I misunderstood your comment about this part - indeed I was
sorting by emotion scores. The reason is that the API returns one score for
each emotion, and here I am trying to show only the one that scores the
highest.

Added a comment here to clarify what I was doing. Please take a look.

This line is really impressive:

            emotion_label = emotion_labels[emotion]

            frame_emotions.append((time_offset, emotion_label, score))

        for time_offset, emotion_label, score in frame_emotions:
            print('\t{:04.2f}s: {:14}({:4.3f})'.format(
                time_offset, emotion_label, score))
        print('\n')

Should we merely print out the results without sorting based on score? That
would help simplify the code. If we were going across the whole segment to
determine the most likely emotion across all frames I think that would be cool,
but not as simple. Also by printing out the

Here actually we are sorting by time_offset - an earlier version of the API did
not sort the frames by it. Emotion is a frame-level output since detecting the
change of emotions is useful. The output might look like this:

(A rough sketch of aggregating emotions across frames appears after this
function.)

# [END video_face_emotions]
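
On the enum-name discussion above: one way to build the index-to-name mapping
programmatically, rather than hard-coding the tuple, is to introspect the
generated enum class. This is only a sketch; it assumes the face emotion enum
is exposed as `videointelligence.enums.Emotion` with plain integer class
attributes, which the library does not guarantee.

def enum_value_names(enum_class):
    """Return a dict mapping enum integer values to their attribute names.

    Sketch only: assumes the generated enum class stores its values as plain
    int class attributes (e.g. AMUSEMENT = 1).
    """
    return {
        value: name
        for name, value in vars(enum_class).items()
        if not name.startswith('_') and isinstance(value, int)
    }


# Hypothetical usage, if videointelligence.enums.Emotion exists:
#     emotion_names = enum_value_names(videointelligence.enums.Emotion)
#     print(emotion_names[emotion])

As the thread concludes, an inline literal tuple keeps the full list of values
visible to the reader, which is why the sample keeps the hard-coded
emotion_labels.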
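
The thread after the printing loop also floats determining the most likely
emotion across all frames of a face rather than per frame. A rough sketch of
that aggregation, reusing the (time_offset, emotion_label, score) tuples that
face_emotions() collects in frame_emotions; the helper name is illustrative,
not part of this PR.

import collections


def most_likely_emotion(frame_emotions):
    """Return the emotion label with the highest total score across frames.

    Sketch only: sums the per-frame scores for each label collected by
    face_emotions() above; raises ValueError if frame_emotions is empty.
    """
    totals = collections.defaultdict(float)
    for _time_offset, emotion_label, score in frame_emotions:
        totals[emotion_label] += score
    return max(totals.items(), key=lambda item: item[1])[0]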


# [START video_speech_transcription]
def speech_transcription(input_uri):

The other 2 functions use `path`. We should make them consistent (and I'd TAL
at whatever variable we use in all of the other Vision samples and use that for
consistency).
    """Transcribe speech from a video stored on GCS."""
    video_client = videointelligence.VideoIntelligenceServiceClient()

    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US')
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        input_uri, features=features,
        video_context=video_context)

    print('\nProcessing video for speech transcription.')

    result = operation.result(timeout=180)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]

Can you add a comment that notes you are only pulling out the first result, or
is there only one result?

Done.
    speech_transcription = annotation_results.speech_transcriptions[0]
    alternative = speech_transcription.alternatives[0]

    print('Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}\n'.format(alternative.confidence))

    print('Word level information:')
    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        print('\t{}s - {}s: {}'.format(
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9,
            word))
# [END video_speech_transcription]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    analyze_faces_parser = subparsers.add_parser(
        'boxes', help=face_bounding_boxes.__doc__)
    analyze_faces_parser.add_argument('path')

    analyze_emotions_parser = subparsers.add_parser(
        'emotions', help=face_emotions.__doc__)
    analyze_emotions_parser.add_argument('path')

    speech_transcription_parser = subparsers.add_parser(
        'transcription', help=speech_transcription.__doc__)
    speech_transcription_parser.add_argument('path')

    args = parser.parse_args()

    if args.command == 'boxes':
        face_bounding_boxes(args.path)
    elif args.command == 'emotions':
        face_emotions(args.path)
    elif args.command == 'transcription':
        speech_transcription(args.path)

@@ -0,0 +1,49 @@
#!/usr/bin/env python

# Copyright 2017 Google, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest

import beta_snippets


BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
FACES_SHORT_FILE_PATH = 'video/googlework_short.mp4'


@pytest.mark.slow
def test_face_bounding_boxes(capsys):
    beta_snippets.face_bounding_boxes(
        'gs://{}/{}'.format(BUCKET, FACES_SHORT_FILE_PATH))
    out, _ = capsys.readouterr()
    assert 'top :' in out


@pytest.mark.slow
def test_face_emotions(capsys):
    beta_snippets.face_emotions(
        'gs://{}/{}'.format(BUCKET, FACES_SHORT_FILE_PATH))
    out, _ = capsys.readouterr()
    assert 'CONCENTRATION' in out


@pytest.mark.slow
def test_speech_transcription(capsys):
    beta_snippets.speech_transcription(
        'gs://{}/{}'.format(BUCKET, FACES_SHORT_FILE_PATH))
    out, _ = capsys.readouterr()
    assert 'cultural' in out

@@ -1 +1 @@
-google-cloud-videointelligence==1.0.1
+google-cloud-videointelligence==1.1.0
Should we make another demo project to hold public files that can be used across languages?