#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample command-line program that creates a Google Dataproc cluster,
submits a PySpark job to it, prints the job output, and deletes the
cluster."""
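# Example invocation (the script name and all values below are illustrative):
#
#   python submit_job_to_cluster.py --project_id=my-project \
#       --zone=us-central1-a --cluster_name=test-cluster \
#       --gcs_bucket=my-staging-bucket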

import argparse
import os

from apiclient import discovery
from gcloud import storage
from oauth2client.client import GoogleCredentials

# Currently only the "global" region is supported
REGION = 'global'
DEFAULT_FILENAME = 'pyspark_sort.py'


def get_default_pyspark_file():
    """Gets the PySpark file from this directory."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
    f = open(os.path.join(current_dir, DEFAULT_FILENAME), 'rb')
    return f, DEFAULT_FILENAME


def get_pyspark_file(filename):
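    """Opens the user-supplied PySpark file and returns the file object
    together with its base name."""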
    f = open(filename, 'rb')
    return f, os.path.basename(filename)


def upload_pyspark_file(project_id, bucket_name, filename, file):
    """Uploads the PySpark file to the configured input bucket."""
    print('Uploading pyspark file to GCS')
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(filename)
    blob.upload_from_file(file)


def download_output(project_id, cluster_id, output_bucket, job_id):
    """Downloads the output file from Cloud Storage and returns it as a
    string."""
    print('Downloading output file')
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(output_bucket)
    output_blob = (
        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
        .format(cluster_id, job_id))
    return bucket.blob(output_blob).download_as_string()


# [START create_cluster]
def create_cluster(dataproc, project, cluster_name, zone):
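    """Creates a new cluster with a minimal configuration in the given zone
    and returns the API response."""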
    print('Creating cluster.')
    zone_uri = (
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
            project, zone))
    cluster_data = {
        'projectId': project,
        'clusterName': cluster_name,
        'config': {
            'gceClusterConfig': {
                'zoneUri': zone_uri
            }
        }
    }
    result = dataproc.projects().regions().clusters().create(
        projectId=project,
        region=REGION,
        body=cluster_data).execute()
    return result
# [END create_cluster]


def wait_for_cluster_creation(dataproc, project_id, cluster_name, zone):
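    """Polls the clusters list until the named cluster reaches the RUNNING
    state, raising an exception if it enters the ERROR state."""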
    print('Waiting for cluster creation')

    while True:
        result = dataproc.projects().regions().clusters().list(
            projectId=project_id,
            region=REGION).execute()
        cluster_list = result['clusters']
        cluster = [c
                   for c in cluster_list
                   if c['clusterName'] == cluster_name][0]
        if cluster['status']['state'] == 'ERROR':
            raise Exception(cluster['status']['details'])
        if cluster['status']['state'] == 'RUNNING':
            print("Cluster created.")
            break


# [START list_clusters_with_detail]
def list_clusters_with_details(dataproc, project):
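    """Lists all clusters in the project's global region, printing the name
    and current state of each, and returns the raw list response."""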
    result = dataproc.projects().regions().clusters().list(
        projectId=project,
        region=REGION).execute()
    cluster_list = result['clusters']
    for cluster in cluster_list:
        print("{} - {}"
              .format(cluster['clusterName'], cluster['status']['state']))
    return result
# [END list_clusters_with_detail]


def get_cluster_id_by_name(cluster_list, cluster_name):
    """Helper function to retrieve the ID and output bucket of a cluster by
    name."""
    cluster = [c for c in cluster_list if c['clusterName'] == cluster_name][0]
    return cluster['clusterUuid'], cluster['config']['configBucket']


# [START submit_pyspark_job]
def submit_pyspark_job(dataproc, project, cluster_name, bucket_name, filename):
    """Submits the PySpark job to the cluster, assuming `filename` has
    already been uploaded to `bucket_name`."""
    job_details = {
        'projectId': project,
        'job': {
            'placement': {
                'clusterName': cluster_name
            },
            'pysparkJob': {
                'mainPythonFileUri': 'gs://{}/{}'.format(bucket_name, filename)
            }
        }
    }
    result = dataproc.projects().regions().jobs().submit(
        projectId=project,
        region=REGION,
        body=job_details).execute()
    job_id = result['reference']['jobId']
    print('Submitted job ID {}'.format(job_id))
    return job_id
# [END submit_pyspark_job]


# [START delete]
def delete_cluster(dataproc, project, cluster):
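    """Deletes the given cluster and returns the API response."""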
    print('Tearing down cluster')
    result = dataproc.projects().regions().clusters().delete(
        projectId=project,
        region=REGION,
        clusterName=cluster).execute()
    return result
# [END delete]


# [START wait]
def wait_for_job(dataproc, project, job_id):
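    """Polls the job until it completes, returning the final job resource or
    raising an exception if the job ends in the ERROR state."""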
    print('Waiting for job to finish...')
    while True:
        result = dataproc.projects().regions().jobs().get(
            projectId=project,
            region=REGION,
            jobId=job_id).execute()
        # Raise an error if the job failed; return once it has finished.
        if result['status']['state'] == 'ERROR':
            print(result)
            raise Exception(result['status']['details'])
        elif result['status']['state'] == 'DONE':
            print('Job finished')
            return result
# [END wait]


# [START get_client]
def get_client():
    """Builds a client to the Dataproc API using Application Default
    Credentials."""
    credentials = GoogleCredentials.get_application_default()
    dataproc = discovery.build('dataproc', 'v1', credentials=credentials)
    return dataproc
# [END get_client]


def main(project_id, zone, cluster_name, bucket_name, pyspark_file=None):
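    """End-to-end flow: creates the cluster, uploads and runs the PySpark
    job, prints the driver output, and finally deletes the cluster."""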
    dataproc = get_client()
    try:
        if pyspark_file:
            spark_file, spark_filename = get_pyspark_file(pyspark_file)
        else:
            spark_file, spark_filename = get_default_pyspark_file()

        create_cluster(dataproc, project_id, cluster_name, zone)
        wait_for_cluster_creation(dataproc, project_id, cluster_name, zone)
        upload_pyspark_file(project_id, bucket_name,
                            spark_filename, spark_file)
        cluster_list = list_clusters_with_details(
            dataproc, project_id)['clusters']

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(cluster_list, cluster_name))
        job_id = submit_pyspark_job(
            dataproc, project_id, cluster_name, bucket_name, spark_filename)
        wait_for_job(dataproc, project_id, job_id)

        output = download_output(
            project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        delete_cluster(dataproc, project_id, cluster_name)
        spark_file.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        '--project_id', help='Project ID you want to access.', required=True)
    parser.add_argument(
        '--zone', help='Zone to create the cluster in', required=True)
    parser.add_argument(
        '--cluster_name', help='Name of the cluster to create', required=True)
    parser.add_argument(
        '--gcs_bucket', help='Bucket to upload the PySpark file to',
        required=True)
    parser.add_argument(
        '--pyspark_file', help='PySpark filename. Defaults to pyspark_sort.py')

    args = parser.parse_args()
    main(
        args.project_id, args.zone,
        args.cluster_name, args.gcs_bucket, args.pyspark_file)