/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// [START dataproc_submit_hadoop_fs_job]
/* This sample walks a user through submitting a Hadoop FS command (an
 * org.apache.hadoop.fs.FsShell query such as "-ls /") as a job to an existing
 * Cloud Dataproc cluster and reading the job's driver output, all using the
 * Java client library.
 *
 * Usage:
 * mvn clean package -DskipTests
 *
 * mvn exec:java -Dexec.args="<PROJECT_ID> <REGION> <CLUSTER_NAME> <HADOOP_FS_QUERY>"
 *
 * You can also set these arguments in the main function instead of providing them via the CLI.
 */
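/* Example invocation (the project, region, and cluster values below are
 * hypothetical placeholders; any existing cluster and any valid FsShell
 * command string should work):
 *
 * mvn exec:java -Dexec.args="my-project-id us-central1 my-cluster '-ls /'"
 */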

import com.google.cloud.dataproc.v1.*;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class SubmitHadoopFSJob {

  public static Job waitForJobCompletion(
      JobControllerClient jobControllerClient, String projectId, String region, String jobId) {
    while (true) {
      // Poll the service periodically until the Job is in a finished state.
      Job jobInfo = jobControllerClient.getJob(projectId, region, jobId);
      switch (jobInfo.getStatus().getState()) {
        case DONE:
        case CANCELLED:
        case ERROR:
          return jobInfo;
        default:
          try {
            // Wait a second in between polling attempts.
            TimeUnit.SECONDS.sleep(1);
          } catch (InterruptedException e) {
            // Restore the interrupt flag before propagating so callers can see it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
          }
      }
    }
  }

  public static void submitHadoopFSJob(
      String projectId, String region, String clusterName, String hadoopFSQuery)
      throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the cluster controller and job controller clients.
    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    JobControllerSettings jobControllerSettings =
        JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Create both a cluster controller client and a job controller client with the
    // configured settings. The clients only need to be created once and can be reused for
    // multiple requests. Using a try-with-resources closes the clients, but this can also be
    // done manually with the .close() method.
    try (ClusterControllerClient clusterControllerClient =
            ClusterControllerClient.create(clusterControllerSettings);
        JobControllerClient jobControllerClient =
            JobControllerClient.create(jobControllerSettings)) {

      // Configure cluster placement for the job.
      JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();

      // Configure Hadoop job settings. The HadoopFS query is split on whitespace so that
      // FsShell receives each token (e.g. "-ls" and "/") as a separate argument.
      HadoopJob hadoopJob =
          HadoopJob.newBuilder()
              .setMainClass("org.apache.hadoop.fs.FsShell")
              .addAllArgs(Arrays.asList(hadoopFSQuery.split(" ")))
              .build();

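      // Combine the cluster placement and the Hadoop job settings into the job to submit.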
      Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();

      // Submit the job. It executes asynchronously on the cluster; the returned Job
      // contains the server-assigned job ID used to poll for completion.
      Job submittedJob = jobControllerClient.submitJob(projectId, region, job);
      String jobId = submittedJob.getReference().getJobId();
      System.out.println(String.format("Submitted job \"%s\"", jobId));

      // Wait for the job to finish.
      CompletableFuture<Job> finishedJobFuture =
          CompletableFuture.supplyAsync(
              () -> waitForJobCompletion(jobControllerClient, projectId, region, jobId));
      // Give up if the job has not reached a finished state within ten minutes.
      int timeout = 10;
      try {
        Job jobInfo = finishedJobFuture.get(timeout, TimeUnit.MINUTES);

        // Cloud Dataproc job output gets saved to the GCS bucket allocated to the cluster.
        Cluster clusterInfo = clusterControllerClient.getCluster(projectId, region, clusterName);
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob =
            storage.get(
                clusterInfo.getConfig().getConfigBucket(),
                String.format(
                    "google-cloud-dataproc-metainfo/%s/jobs/%s/driveroutput.000000000",
                    clusterInfo.getClusterUuid(), jobId));
        System.out.println(
            String.format(
                "Job \"%s\" finished with state %s:%n%s",
                jobId, jobInfo.getStatus().getState(), new String(blob.getContent())));
      } catch (TimeoutException e) {
        System.err.println(
            String.format("Job timed out after %d minutes: %s", timeout, e.getMessage()));
      }

    } catch (ExecutionException e) {
      System.err.println(String.format("Error submitting the Hadoop FS job: %s", e.getMessage()));
    }
  }

  public static void main(String... args) throws IOException, InterruptedException {
    if (args.length != 4) {
      System.err.println(
          "Insufficient number of parameters provided. Please make sure a "
              + "PROJECT_ID, REGION, CLUSTER_NAME and HADOOP_FS_QUERY are provided, in this order.");
      return;
    }

    String projectId = args[0]; // project-id of the project the cluster belongs to
    String region = args[1]; // region the cluster is located in
    String clusterName = args[2]; // name of the cluster to submit the job to
    String hadoopFSQuery = args[3]; // a Hadoop FS command, e.g. "-ls /"

    submitHadoopFSJob(projectId, region, clusterName, hadoopFSQuery);
  }
}
// [END dataproc_submit_hadoop_fs_job]